diff --git a/.clang-tidy b/.clang-tidy index 9cece0de812b8..06bb0f18e9d2e 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,4 +1,16 @@ -Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-const-correctness,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,-misc-no-recursion,-misc-use-anonymous-namespace,readability-identifier-naming,-misc-include-cleaner' +Checks: > + -*, + clang-diagnostic-*, + llvm-*, + misc-*, + -misc-const-correctness, + -misc-include-cleaner, + -misc-no-recursion, + -misc-non-private-member-variables-in-classes, + -misc-unused-parameters, + -misc-use-anonymous-namespace, + readability-identifier-naming + CheckOptions: - key: readability-identifier-naming.ClassCase value: CamelCase diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 197bf083a20c0..efe08ebc221c5 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -2,7 +2,7 @@ FROM docker.io/library/ubuntu:24.04 as base ENV LLVM_SYSROOT=/opt/llvm FROM base as stage1-toolchain -ENV LLVM_VERSION=20.1.4 +ENV LLVM_VERSION=20.1.8 RUN apt-get update && \ apt-get install -y \ diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index f0bdf6c0b5899..ec937de02ca1a 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -36,8 +36,7 @@ concurrency: jobs: stage1: if: github.repository_owner == 'llvm' - runs-on: libcxx-self-hosted-linux - container: ghcr.io/llvm/libcxx-linux-builder:b060022103f551d8ca1dad84122ef73927c86512 + runs-on: llvm-premerge-libcxx-runners continue-on-error: false strategy: fail-fast: false @@ -74,8 +73,7 @@ jobs: **/crash_diagnostics/* stage2: if: github.repository_owner == 'llvm' - runs-on: libcxx-self-hosted-linux - container: 
ghcr.io/llvm/libcxx-linux-builder:2b57ebb50b6d418e70382e655feaa619b558e254 + runs-on: llvm-premerge-libcxx-runners needs: [ stage1 ] continue-on-error: false strategy: @@ -149,21 +147,20 @@ jobs: 'generic-static', 'bootstrapping-build' ] - machine: [ 'libcxx-self-hosted-linux' ] + machine: [ 'llvm-premerge-libcxx-runners' ] include: - config: 'generic-cxx26' - machine: libcxx-self-hosted-linux + machine: llvm-premerge-libcxx-runners - config: 'generic-asan' - machine: libcxx-self-hosted-linux + machine: llvm-premerge-libcxx-runners - config: 'generic-tsan' - machine: libcxx-self-hosted-linux + machine: llvm-premerge-libcxx-runners - config: 'generic-ubsan' - machine: libcxx-self-hosted-linux + machine: llvm-premerge-libcxx-runners # Use a larger machine for MSAN to avoid timeout and memory allocation issues. - config: 'generic-msan' - machine: libcxx-self-hosted-linux + machine: llvm-premerge-libcxx-runners runs-on: ${{ matrix.machine }} - container: ghcr.io/llvm/libcxx-linux-builder:2b57ebb50b6d418e70382e655feaa619b558e254 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: ${{ matrix.config }} diff --git a/.github/workflows/libcxx-restart-preempted-jobs.yaml b/.github/workflows/libcxx-restart-preempted-jobs.yaml deleted file mode 100644 index accb84efb5c90..0000000000000 --- a/.github/workflows/libcxx-restart-preempted-jobs.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: Restart Preempted Libc++ Workflow - -# The libc++ builders run on preemptable VMs, which can be shutdown at any time. -# This workflow identifies when a workflow run was canceled due to the VM being preempted, -# and restarts the workflow run. - -# We identify a canceled workflow run by checking the annotations of the check runs in the check suite, -# which should contain the message "The runner has received a shutdown signal." - -# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow. 
- -on: - workflow_run: - workflows: [Build and Test libc\+\+] - types: - - completed - -permissions: - contents: read - -jobs: - restart: - if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure') - name: "Restart Job" - permissions: - statuses: read - checks: write - actions: write - runs-on: ubuntu-24.04 - steps: - - name: "Restart Job" - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 - with: - script: | - // The "The run was canceled by" message comes from a user manually canceling a workflow - // the "higher priority" message comes from github canceling a workflow because the user updated the change. - // And the "exit code 1" message indicates a genuine failure. - const failure_regex = /(Process completed with exit code 1.)/ - const preemption_regex = /(The runner has received a shutdown signal)|(The operation was canceled)/ - - const wf_run = context.payload.workflow_run - core.notice(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`) - - - async function create_check_run(conclusion, message) { - // Create a check run on the given workflow run to indicate if - // we are restarting the workflow or not. 
- if (conclusion != 'success' && conclusion != 'skipped' && conclusion != 'neutral') { - core.setFailed('Invalid conclusion: ' + conclusion) - } - await github.rest.checks.create({ - owner: context.repo.owner, - repo: context.repo.repo, - name: 'Restart Preempted Job', - head_sha: wf_run.head_sha, - status: 'completed', - conclusion: conclusion, - output: { - title: 'Restarted Preempted Job', - summary: message - } - }) - } - - console.log('Listing check runs for suite') - const check_suites = await github.rest.checks.listForSuite({ - owner: context.repo.owner, - repo: context.repo.repo, - check_suite_id: context.payload.workflow_run.check_suite_id, - per_page: 100 // FIXME: We don't have 100 check runs yet, but we should handle this better. - }) - - check_run_ids = []; - for (check_run of check_suites.data.check_runs) { - console.log('Checking check run: ' + check_run.id); - if (check_run.status != 'completed') { - console.log('Check run was not completed. Skipping.'); - continue; - } - if (check_run.conclusion != 'failure') { - console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.'); - continue; - } - check_run_ids.push(check_run.id); - } - - has_preempted_job = false; - - for (check_run_id of check_run_ids) { - console.log('Listing annotations for check run: ' + check_run_id); - - annotations = await github.rest.checks.listAnnotations({ - owner: context.repo.owner, - repo: context.repo.repo, - check_run_id: check_run_id - }) - - // For temporary debugging purposes to see the structure of the annotations. 
- console.log(annotations); - - has_failed_job = false; - saved_failure_message = null; - - for (annotation of annotations.data) { - if (annotation.annotation_level != 'failure') { - continue; - } - - const preemption_match = annotation.message.match(preemption_regex); - - if (preemption_match != null) { - console.log('Found preemption message: ' + annotation.message); - has_preempted_job = true; - } - - const failure_match = annotation.message.match(failure_regex); - if (failure_match != null) { - has_failed_job = true; - saved_failure_message = annotation.message; - } - } - if (has_failed_job && (! has_preempted_job)) { - // We only want to restart the workflow if all of the failures were due to preemption. - // We don't want to restart the workflow if there were other failures. - // - // However, libcxx runners running inside docker containers produce both a preemption message and failure message. - // - // The desired approach is to ignore failure messages which appear on the same job as a preemption message - // (An job is a single run with a specific configuration, ex generic-gcc, gcc-14). - // - // However, it's unclear that this code achieves the desired approach, and it may ignore all failures - // if a preemption message is found at all on any run. - // - // For now, it's more important to restart preempted workflows than to avoid restarting workflows with - // non-preemption failures. - // - // TODO Figure this out. - core.notice('Choosing not to rerun workflow because we found a non-preemption failure' + - 'Failure message: "' + saved_failure_message + '"'); - await create_check_run('skipped', 'Choosing not to rerun workflow because we found a non-preemption failure\n' - + 'Failure message: ' + saved_failure_message) - return; - } - } - - if (!has_preempted_job) { - core.notice('No preempted jobs found. Not restarting workflow.'); - await create_check_run('neutral', 'No preempted jobs found. 
Not restarting workflow.') - return; - } - - core.notice("Restarted workflow: " + context.payload.workflow_run.id); - await github.rest.actions.reRunWorkflowFailedJobs({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: context.payload.workflow_run.id - }) - await create_check_run('success', 'Restarted workflow run due to preempted job') diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 05d69861e1841..70bcaafbd0cf3 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -55,7 +55,7 @@ jobs: - name: Install clang-format uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 with: - clangformat: 20.1.5 + clangformat: 20.1.8 - name: Setup Python env uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 3b7751629564d..f7a48304b82b0 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -63,6 +63,7 @@ jobs: ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}" - name: Upload Artifacts + if: '!cancelled()' uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: Premerge Artifacts (Linux) @@ -113,6 +114,7 @@ jobs: call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}" - name: Upload Artifacts + if: '!cancelled()' uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: name: Premerge Artifacts (Windows) diff --git a/clang-tools-extra/clang-doc/BitcodeReader.cpp b/clang-tools-extra/clang-doc/BitcodeReader.cpp index f756ae6d897c8..dce34a8434ff8 100644 --- 
a/clang-tools-extra/clang-doc/BitcodeReader.cpp +++ b/clang-tools-extra/clang-doc/BitcodeReader.cpp @@ -180,6 +180,8 @@ static llvm::Error parseRecord(const Record &R, unsigned ID, return decodeRecord(R, I->TagType, Blob); case RECORD_IS_TYPE_DEF: return decodeRecord(R, I->IsTypeDef, Blob); + case RECORD_MANGLED_NAME: + return decodeRecord(R, I->MangledName, Blob); default: return llvm::createStringError(llvm::inconvertibleErrorCode(), "invalid field for RecordInfo"); diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp index 3cc0d4ad332f0..eed23726e17bf 100644 --- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp +++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp @@ -189,6 +189,7 @@ static const llvm::IndexedMap {RECORD_LOCATION, {"Location", &genLocationAbbrev}}, {RECORD_TAG_TYPE, {"TagType", &genIntAbbrev}}, {RECORD_IS_TYPE_DEF, {"IsTypeDef", &genBoolAbbrev}}, + {RECORD_MANGLED_NAME, {"MangledName", &genStringAbbrev}}, {BASE_RECORD_USR, {"USR", &genSymbolIdAbbrev}}, {BASE_RECORD_NAME, {"Name", &genStringAbbrev}}, {BASE_RECORD_PATH, {"Path", &genStringAbbrev}}, @@ -271,7 +272,8 @@ static const std::vector>> // Record Block {BI_RECORD_BLOCK_ID, {RECORD_USR, RECORD_NAME, RECORD_PATH, RECORD_DEFLOCATION, - RECORD_LOCATION, RECORD_TAG_TYPE, RECORD_IS_TYPE_DEF}}, + RECORD_LOCATION, RECORD_TAG_TYPE, RECORD_IS_TYPE_DEF, + RECORD_MANGLED_NAME}}, // BaseRecord Block {BI_BASE_RECORD_BLOCK_ID, {BASE_RECORD_USR, BASE_RECORD_NAME, BASE_RECORD_PATH, @@ -616,6 +618,7 @@ void ClangDocBitcodeWriter::emitBlock(const RecordInfo &I) { emitRecord(I.USR, RECORD_USR); emitRecord(I.Name, RECORD_NAME); emitRecord(I.Path, RECORD_PATH); + emitRecord(I.MangledName, RECORD_MANGLED_NAME); for (const auto &N : I.Namespace) emitBlock(N, FieldId::F_namespace); for (const auto &CI : I.Description) diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.h b/clang-tools-extra/clang-doc/BitcodeWriter.h index d09ec4ca34006..501af12582a8e 100644 --- 
a/clang-tools-extra/clang-doc/BitcodeWriter.h +++ b/clang-tools-extra/clang-doc/BitcodeWriter.h @@ -126,6 +126,7 @@ enum RecordId { RECORD_LOCATION, RECORD_TAG_TYPE, RECORD_IS_TYPE_DEF, + RECORD_MANGLED_NAME, BASE_RECORD_USR, BASE_RECORD_NAME, BASE_RECORD_PATH, diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 0e1a0cc347e45..6fdc7196e9095 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -386,6 +386,7 @@ static void serializeInfo(const RecordInfo &I, json::Object &Obj, Obj["FullName"] = I.FullName; Obj["TagType"] = getTagType(I.TagType); Obj["IsTypedef"] = I.IsTypeDef; + Obj["MangledName"] = I.MangledName; if (!I.Children.Functions.empty()) { json::Value PubFunctionsArray = Array(); @@ -491,6 +492,23 @@ static void serializeInfo(const NamespaceInfo &I, json::Object &Obj, serializeCommonChildren(I.Children, Obj, RepositoryUrl); } +static SmallString<16> determineFileName(Info *I, SmallString<128> &Path) { + SmallString<16> FileName; + if (I->IT == InfoType::IT_record) { + auto *RecordSymbolInfo = static_cast(I); + if (RecordSymbolInfo->MangledName.size() < 255) + FileName = RecordSymbolInfo->MangledName; + else + FileName = toStringRef(toHex(RecordSymbolInfo->USR)); + } else if (I->IT == InfoType::IT_namespace && I->Name != "") + // Serialize the global namespace as index.json + FileName = I->Name; + else + FileName = I->getFileBaseName(); + sys::path::append(Path, FileName + ".json"); + return FileName; +} + Error JSONGenerator::generateDocs( StringRef RootDir, llvm::StringMap> Infos, const ClangDocContext &CDCtx) { @@ -501,7 +519,6 @@ Error JSONGenerator::generateDocs( SmallString<128> Path; sys::path::native(RootDir, Path); - sys::path::append(Path, Info->getRelativeFilePath("")); if (!CreatedDirs.contains(Path)) { if (std::error_code Err = sys::fs::create_directories(Path); Err != std::error_code()) @@ -509,7 +526,7 @@ Error 
JSONGenerator::generateDocs( CreatedDirs.insert(Path); } - sys::path::append(Path, Info->getFileBaseName() + ".json"); + SmallString<16> FileName = determineFileName(Info, Path); FileToInfos[Path].push_back(Info); } diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp index 422a76d99e5b3..beaf314a04ae1 100644 --- a/clang-tools-extra/clang-doc/Representation.cpp +++ b/clang-tools-extra/clang-doc/Representation.cpp @@ -290,6 +290,8 @@ void SymbolInfo::merge(SymbolInfo &&Other) { auto *Last = llvm::unique(Loc); Loc.erase(Last, Loc.end()); mergeBase(std::move(Other)); + if (MangledName.empty()) + MangledName = std::move(Other.MangledName); } NamespaceInfo::NamespaceInfo(SymbolID USR, StringRef Name, StringRef Path) diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h index fe5cc48069d58..23f0e90daa27f 100644 --- a/clang-tools-extra/clang-doc/Representation.h +++ b/clang-tools-extra/clang-doc/Representation.h @@ -377,6 +377,7 @@ struct SymbolInfo : public Info { std::optional DefLoc; // Location where this decl is defined. llvm::SmallVector Loc; // Locations where this decl is declared. 
+ SmallString<16> MangledName; bool IsStatic = false; }; diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp index 6cc372ce98a6d..7a0e00c6d9c2d 100644 --- a/clang-tools-extra/clang-doc/Serialize.cpp +++ b/clang-tools-extra/clang-doc/Serialize.cpp @@ -12,6 +12,7 @@ #include "clang/AST/Attr.h" #include "clang/AST/Comment.h" #include "clang/AST/DeclFriend.h" +#include "clang/AST/Mangle.h" #include "clang/Index/USRGeneration.h" #include "clang/Lex/Lexer.h" #include "llvm/ADT/StringExtras.h" @@ -767,6 +768,17 @@ static void populateSymbolInfo(SymbolInfo &I, const T *D, const FullComment *C, I.DefLoc = Loc; else I.Loc.emplace_back(Loc); + + auto *Mangler = ItaniumMangleContext::create( + D->getASTContext(), D->getASTContext().getDiagnostics()); + std::string MangledName; + llvm::raw_string_ostream MangledStream(MangledName); + if (auto *CXXD = dyn_cast(D)) + Mangler->mangleCXXVTable(CXXD, MangledStream); + else + MangledStream << D->getNameAsString(); + I.MangledName = MangledName; + delete Mangler; } static void diff --git a/clang-tools-extra/clang-tidy/.clang-tidy b/clang-tools-extra/clang-tidy/.clang-tidy new file mode 100644 index 0000000000000..2443c979621da --- /dev/null +++ b/clang-tools-extra/clang-tidy/.clang-tidy @@ -0,0 +1,41 @@ +InheritParentConfig: true +Checks: > + bugprone-*, + -bugprone-assignment-in-if-condition, + -bugprone-branch-clone, + -bugprone-easily-swappable-parameters, + -bugprone-narrowing-conversions, + -bugprone-suspicious-stringview-data-usage, + -bugprone-unchecked-optional-access, + -bugprone-unused-return-value, + modernize-*, + -modernize-avoid-c-arrays, + -modernize-pass-by-value, + -modernize-use-auto, + -modernize-use-nodiscard, + -modernize-use-trailing-return-type, + performance-*, + -performance-enum-size, + -performance-move-const-arg, + -performance-no-int-to-ptr, + -performance-type-promotion-in-math-fn, + -performance-unnecessary-value-param, + readability-*, + 
-readability-avoid-nested-conditional-operator, + -readability-avoid-return-with-void-value, + -readability-braces-around-statements, + -readability-container-contains, + -readability-convert-member-functions-to-static, + -readability-else-after-return, + -readability-function-cognitive-complexity, + -readability-identifier-length, + -readability-implicit-bool-conversion, + -readability-isolate-declaration, + -readability-magic-numbers, + -readability-named-parameter, + -readability-qualified-auto, + -readability-redundant-declaration, + -readability-simplify-boolean-expr, + -readability-static-definition-in-anonymous-namespace, + -readability-suspicious-call-argument, + -readability-use-anyofallof diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h index b081c4c479b92..d91e6393a0e85 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h @@ -22,7 +22,7 @@ class UseRangesCheck : public utils::UseRangesCheck { public: UseRangesCheck(StringRef Name, ClangTidyContext *Context); - void storeOptions(ClangTidyOptions::OptionMap &Options) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; ReplacerMap getReplacerMap() const override; diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp index 8cdd5d0a56467..b843e317c471d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp @@ -145,7 +145,7 @@ static bool isLikelyTypo(llvm::ArrayRef Params, std::string ArgNameLowerStr = ArgName.lower(); StringRef ArgNameLower = ArgNameLowerStr; // The threshold is arbitrary. 
- unsigned UpperBound = (ArgName.size() + 2) / 3 + 1; + unsigned UpperBound = ((ArgName.size() + 2) / 3) + 1; unsigned ThisED = ArgNameLower.edit_distance( Params[ArgIndex]->getIdentifier()->getName().lower(), /*AllowReplacements=*/true, UpperBound); diff --git a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp index 28e8fe002d575..6565fa3f7c85b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.cpp @@ -129,13 +129,10 @@ void CrtpConstructorAccessibilityCheck::check( << HintFriend; } - auto WithFriendHintIfNeeded = - [&](const DiagnosticBuilder &Diag, - bool NeedsFriend) -> const DiagnosticBuilder & { + auto WithFriendHintIfNeeded = [&](const DiagnosticBuilder &Diag, + bool NeedsFriend) { if (NeedsFriend) Diag << HintFriend; - - return Diag; }; if (!CRTPDeclaration->hasUserDeclaredConstructor()) { diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp index 07116a7ff15f5..3c3024d538785 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp @@ -18,8 +18,7 @@ using namespace clang::ast_matchers; using clang::ast_matchers::internal::Matcher; using clang::tidy::utils::hasPtrOrReferenceInFunc; -namespace clang { -namespace tidy::bugprone { +namespace clang::tidy::bugprone { namespace { /// matches a Decl if it has a "no return" attribute of any kind @@ -327,5 +326,4 @@ void InfiniteLoopCheck::check(const MatchFinder::MatchResult &Result) { } } -} // namespace tidy::bugprone -} // namespace clang +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp 
index 01276af6c7d8f..879040177079a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MacroRepeatedSideEffectsCheck.cpp @@ -153,8 +153,7 @@ unsigned MacroRepeatedPPCallbacks::countArgumentExpansions( // Count argument. if (TII == Arg) { Current++; - if (Current > Max) - Max = Current; + Max = std::max(Max, Current); } } return Max; diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index 88d2f2c388d07..88e048e65d4e8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -370,16 +370,16 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { << E->getSourceRange(); } else if (Result.Nodes.getNodeAs("loop-expr")) { auto *SizeofArgTy = Result.Nodes.getNodeAs("sizeof-arg-type"); - if (const auto member = dyn_cast(SizeofArgTy)) - SizeofArgTy = member->getPointeeType().getTypePtr(); + if (const auto *Member = dyn_cast(SizeofArgTy)) + SizeofArgTy = Member->getPointeeType().getTypePtr(); const auto *SzOfExpr = Result.Nodes.getNodeAs("sizeof-expr"); - if (const auto type = dyn_cast(SizeofArgTy)) { + if (const auto *Type = dyn_cast(SizeofArgTy)) { // check if the array element size is larger than one. 
If true, // the size of the array is higher than the number of elements - CharUnits sSize = Ctx.getTypeSizeInChars(type->getElementType()); - if (!sSize.isOne()) { + CharUnits SSize = Ctx.getTypeSizeInChars(Type->getElementType()); + if (!SSize.isOne()) { diag(SzOfExpr->getBeginLoc(), "suspicious usage of 'sizeof' in the loop") << SzOfExpr->getSourceRange(); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index a45949314a4ca..0f2c18ae02663 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -248,7 +248,7 @@ void UnsafeFunctionsCheck::registerMatchers(MatchFinder *Finder) { FunctionNames.reserve(CustomFunctions.size()); for (const auto &Entry : CustomFunctions) - FunctionNames.push_back(Entry.Name); + FunctionNames.emplace_back(Entry.Name); auto CustomFunctionsMatcher = matchers::matchesAnyListedName(FunctionNames); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp index 268b51f76a2c3..82fd3316b942a 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp @@ -81,7 +81,7 @@ AST_MATCHER_P(LambdaExpr, hasCaptureDefaultKind, LambdaCaptureDefault, Kind) { AST_MATCHER(VarDecl, hasIdentifier) { const IdentifierInfo *ID = Node.getIdentifier(); - return ID != NULL && !ID->isPlaceholder(); + return ID != nullptr && !ID->isPlaceholder(); } } // namespace diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp index cf81da816964f..9b2af2a8ca7d8 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp @@ 
-910,9 +910,9 @@ static bool areExprsSameMacroOrLiteral(const BinaryOperator *BinOp, if (Rsr.getBegin().isMacroID()) { // Both sides are macros so they are same macro or literal const llvm::StringRef L = Lexer::getSourceText( - CharSourceRange::getTokenRange(Lsr), SM, Context->getLangOpts(), 0); + CharSourceRange::getTokenRange(Lsr), SM, Context->getLangOpts()); const llvm::StringRef R = Lexer::getSourceText( - CharSourceRange::getTokenRange(Rsr), SM, Context->getLangOpts(), 0); + CharSourceRange::getTokenRange(Rsr), SM, Context->getLangOpts()); return areStringsSameIgnoreSpaces(L, R); } // Left is macro but right is not so they are not same macro or literal diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index 9e4d184c4b6e1..e9b96c4016af6 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -161,7 +161,7 @@ matchTrailingTemplateParam(const FunctionTemplateDecl *FunctionTemplate) { const TemplateParameterList *TemplateParams = FunctionTemplate->getTemplateParameters(); - if (TemplateParams->size() == 0) + if (TemplateParams->empty()) return {}; const NamedDecl *LastParam = @@ -419,7 +419,7 @@ handleTrailingTemplateType(const FunctionTemplateDecl *FunctionTemplate, SourceRange RemovalRange; const TemplateParameterList *TemplateParams = FunctionTemplate->getTemplateParameters(); - if (!TemplateParams || TemplateParams->size() == 0) + if (!TemplateParams || TemplateParams->empty()) return {}; if (TemplateParams->size() == 1) { diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h index 2f4cace653cf1..51327dab53e3d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h @@ -22,7 +22,7 @@ class UseRangesCheck : public utils::UseRangesCheck 
{ public: UseRangesCheck(StringRef CheckName, ClangTidyContext *Context); - void storeOptions(ClangTidyOptions::OptionMap &Options) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; ReplacerMap getReplacerMap() const override; diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp index 9c2fc9e06fb45..52e9a9f8d49e0 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp @@ -104,8 +104,7 @@ getTemplateLockGuardTypeLoc(const TypeSourceInfo *SourceInfo) { static SourceRange getLockGuardRange(const TypeSourceInfo *SourceInfo) { const TypeLoc LockGuardTypeLoc = SourceInfo->getTypeLoc(); - return SourceRange(LockGuardTypeLoc.getBeginLoc(), - LockGuardTypeLoc.getEndLoc()); + return {LockGuardTypeLoc.getBeginLoc(), LockGuardTypeLoc.getEndLoc()}; } // Find the exact source range of the 'lock_guard' name token @@ -115,8 +114,8 @@ static SourceRange getLockGuardNameRange(const TypeSourceInfo *SourceInfo) { if (!TemplateLoc) return {}; - return SourceRange(TemplateLoc.getTemplateNameLoc(), - TemplateLoc.getLAngleLoc().getLocWithOffset(-1)); + return {TemplateLoc.getTemplateNameLoc(), + TemplateLoc.getLAngleLoc().getLocWithOffset(-1)}; } const static StringRef UseScopedLockMessage = diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp index cf2fa1955ca1b..5cabc6df21da9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.cpp @@ -33,7 +33,7 @@ UseStdFormatCheck::UseStdFormatCheck(StringRef Name, ClangTidyContext *Context) areDiagsSelfContained()), MaybeHeaderToInclude(Options.get("FormatHeader")) { if (StrFormatLikeFunctions.empty()) - StrFormatLikeFunctions.push_back("absl::StrFormat"); + 
StrFormatLikeFunctions.emplace_back("absl::StrFormat"); if (!MaybeHeaderToInclude && ReplacementFormatFunction == "std::format") MaybeHeaderToInclude = ""; diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp index 3b847f51d2173..ffbdb025848d7 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp @@ -102,7 +102,7 @@ void PropertyDeclarationCheck::registerMatchers(MatchFinder *Finder) { void PropertyDeclarationCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs("property"); - assert(MatchedDecl->getName().size() > 0); + assert(!MatchedDecl->getName().empty()); auto *DeclContext = MatchedDecl->getDeclContext(); auto *CategoryDecl = llvm::dyn_cast(DeclContext); diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp index 30df40bda57d8..d6784d0e8fba8 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp @@ -71,7 +71,7 @@ AST_MATCHER(CXXMethodDecl, usesThis) { } UsageOfThis; // TraverseStmt does not modify its argument. 
- UsageOfThis.TraverseStmt(const_cast(Node.getBody())); + UsageOfThis.TraverseStmt(Node.getBody()); return UsageOfThis.Used; } diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp index 85852c2c829a1..aace96f54c61c 100644 --- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp @@ -211,7 +211,7 @@ AST_MATCHER(CXXMethodDecl, usesThisAsConst) { FindUsageOfThis UsageOfThis(Finder->getASTContext()); // TraverseStmt does not modify its argument. - UsageOfThis.TraverseStmt(const_cast(Node.getBody())); + UsageOfThis.TraverseStmt(Node.getBody()); return UsageOfThis.Usage == Const; } diff --git a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp index ea6597dbdd617..6bb8c394f75cc 100644 --- a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.cpp @@ -15,6 +15,17 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { +NamedParameterCheck::NamedParameterCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + InsertPlainNamesInForwardDecls( + Options.get("InsertPlainNamesInForwardDecls", false)) {} + +void NamedParameterCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "InsertPlainNamesInForwardDecls", + InsertPlainNamesInForwardDecls); +} + void NamedParameterCheck::registerMatchers(ast_matchers::MatchFinder *Finder) { Finder->addMatcher(functionDecl().bind("decl"), this); } @@ -84,7 +95,8 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { for (auto P : UnnamedParams) { // Fallback to an unused marker. 
- StringRef NewName = "unused"; + static constexpr StringRef FallbackName = "unused"; + StringRef NewName = FallbackName; // If the method is overridden, try to copy the name from the base method // into the overrider. @@ -105,12 +117,25 @@ void NamedParameterCheck::check(const MatchFinder::MatchResult &Result) { NewName = Name; } - // Now insert the comment. Note that getLocation() points to the place + // Now insert the fix. Note that getLocation() points to the place // where the name would be, this allows us to also get complex cases like // function pointers right. const ParmVarDecl *Parm = P.first->getParamDecl(P.second); - D << FixItHint::CreateInsertion(Parm->getLocation(), - " /*" + NewName.str() + "*/"); + + // The fix depends on the InsertPlainNamesInForwardDecls option, + // whether this is a forward declaration and whether the parameter has + // a real name. + const bool IsForwardDeclaration = (!Definition || Function != Definition); + if (InsertPlainNamesInForwardDecls && IsForwardDeclaration && + NewName != FallbackName) { + // For forward declarations with InsertPlainNamesInForwardDecls enabled, + // insert the parameter name without comments. + D << FixItHint::CreateInsertion(Parm->getLocation(), + " " + NewName.str()); + } else { + D << FixItHint::CreateInsertion(Parm->getLocation(), + " /*" + NewName.str() + "*/"); + } } } } diff --git a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h index 812d90ef7319c..f14a74d75eb49 100644 --- a/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NamedParameterCheck.h @@ -26,13 +26,16 @@ namespace clang::tidy::readability { /// Corresponding cpplint.py check name: 'readability/function'. 
class NamedParameterCheck : public ClangTidyCheck { public: - NamedParameterCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + NamedParameterCheck(StringRef Name, ClangTidyContext *Context); void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; std::optional getCheckTraversalKind() const override { return TK_IgnoreUnlessSpelledInSource; } + +private: + const bool InsertPlainNamesInForwardDecls; }; } // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index 9db059c26d6f2..5a04029e4a6fa 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -217,7 +217,7 @@ static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, SmallVector ArgFlags(ArgLen); SmallVector ParamFlags(ParamLen); std::ptrdiff_t Range = - std::max(std::ptrdiff_t{0}, std::max(ArgLen, ParamLen) / 2 - 1); + std::max(std::ptrdiff_t{0}, (std::max(ArgLen, ParamLen) / 2) - 1); // Calculate matching characters. for (std::ptrdiff_t I = 0; I < ParamLen; ++I) @@ -260,7 +260,7 @@ static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, // Calculate common string prefix up to 4 chars. 
L = 0; for (std::ptrdiff_t I = 0; - I < std::min(std::min(ArgLen, ParamLen), std::ptrdiff_t{4}); ++I) + I < std::min({ArgLen, ParamLen, std::ptrdiff_t{4}}); ++I) if (tolower(Arg[I]) == tolower(Param[I])) ++L; diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp index 6faeb7a0b76e1..6914ec2beb2fb 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp @@ -57,7 +57,7 @@ class AggregateDesignatorNames { } } // Returns false if the type was not an aggregate. - operator bool() { return Valid; } + operator bool() const { return Valid; } // Advance to the next element in the aggregate. void next() { if (IsArray) diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp index 5b38ace13e2f2..53ce28e019f75 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp @@ -239,7 +239,9 @@ class HeaderGuardPPCallbacks : public PPCallbacks { Check->diag(StartLoc, "header is missing header guard") << FixItHint::CreateInsertion( - StartLoc, "#ifndef " + CPPVar + "\n#define " + CPPVar + "\n\n") + StartLoc, + (Twine("#ifndef ") + CPPVar + "\n#define " + CPPVar + "\n\n") + .str()) << FixItHint::CreateInsertion( SM.getLocForEndOfFile(FID), Check->shouldSuggestEndifComment(FileName) diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 9104723c7f1c0..6cf38ddf3d914 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -30,11 +30,11 @@ template <> struct DenseMapInfo { using NamingCheckId = clang::tidy::RenamerClangTidyCheck::NamingCheckId; - static inline NamingCheckId getEmptyKey() { + static NamingCheckId getEmptyKey() { 
return {DenseMapInfo::getEmptyKey(), "EMPTY"}; } - static inline NamingCheckId getTombstoneKey() { + static NamingCheckId getTombstoneKey() { return {DenseMapInfo::getTombstoneKey(), "TOMBSTONE"}; } diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp index 16ee7ee79d75e..e421c9f11b24b 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.cpp @@ -43,7 +43,7 @@ static std::string getFullPrefix(ArrayRef Signature) { llvm::raw_string_ostream OS(Output); for (const UseRangesCheck::Indexes &Item : Signature) OS << Item.BeginArg << ":" << Item.EndArg << ":" - << (Item.ReplaceArg == Item.First ? '0' : '1'); + << (Item.ReplaceArg == UseRangesCheck::Indexes::First ? '0' : '1'); return Output; } @@ -194,7 +194,7 @@ static void removeFunctionArgs(DiagnosticBuilder &Diag, const CallExpr &Call, void UseRangesCheck::check(const MatchFinder::MatchResult &Result) { Replacer *Replacer = nullptr; const FunctionDecl *Function = nullptr; - for (auto [Node, Value] : Result.Nodes.getMap()) { + for (const auto &[Node, Value] : Result.Nodes.getMap()) { StringRef NodeStr(Node); if (!NodeStr.consume_front(FuncDecl)) continue; diff --git a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h index 3a454bcf0cf07..a5ba6802dd89e 100644 --- a/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/utils/UseRangesCheck.h @@ -81,7 +81,7 @@ class UseRangesCheck : public ClangTidyCheck { void registerMatchers(ast_matchers::MatchFinder *Finder) final; void check(const ast_matchers::MatchFinder::MatchResult &Result) final; bool isLanguageVersionSupported(const LangOptions &LangOpts) const override; - void storeOptions(ClangTidyOptions::OptionMap &Options) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; std::optional getCheckTraversalKind() const override; 
private: diff --git a/clang-tools-extra/clangd/ModulesBuilder.cpp b/clang-tools-extra/clangd/ModulesBuilder.cpp index d88aa01aad05d..6658111d6c7b4 100644 --- a/clang-tools-extra/clangd/ModulesBuilder.cpp +++ b/clang-tools-extra/clangd/ModulesBuilder.cpp @@ -160,6 +160,16 @@ class ReusablePrerequisiteModules : public PrerequisiteModules { RequiredModule->getModuleFilePath().str()); } + std::string getAsString() const { + std::string Result; + llvm::raw_string_ostream OS(Result); + for (const auto &ModuleFile : RequiredModules) { + OS << "-fmodule-file=" << ModuleFile->getModuleName() << "=" + << ModuleFile->getModuleFilePath() << " "; + } + return Result; + } + bool canReuse(const CompilerInvocation &CI, llvm::IntrusiveRefCntPtr) const override; @@ -296,8 +306,27 @@ buildModuleFile(llvm::StringRef ModuleName, PathRef ModuleUnitFileName, GenerateReducedModuleInterfaceAction Action; Clang->ExecuteAction(Action); - if (Clang->getDiagnostics().hasErrorOccurred()) - return llvm::createStringError("Compilation failed"); + if (Clang->getDiagnostics().hasErrorOccurred()) { + std::string Cmds; + for (const auto &Arg : Inputs.CompileCommand.CommandLine) { + if (!Cmds.empty()) + Cmds += " "; + Cmds += Arg; + } + + clangd::vlog("Failed to compile {0} with command: {1}.", ModuleUnitFileName, + Cmds); + + std::string BuiltModuleFilesStr = BuiltModuleFiles.getAsString(); + if (!BuiltModuleFilesStr.empty()) + clangd::vlog("The actual used module files built by clangd is {0}", + BuiltModuleFilesStr); + + return llvm::createStringError( + llvm::formatv("Failed to compile {0}. 
Use '--log=verbose' to view " + "detailed failure reasons.", + ModuleUnitFileName)); + } return ModuleFile{ModuleName, Inputs.CompileCommand.Output}; } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ad869265a2db5..95e6ee1b51334 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -215,7 +215,7 @@ Changes in existing checks - Improved :doc:`cppcoreguidelines-missing-std-forward ` check by adding a flag to specify the function used for forwarding instead of ``std::forward``. - + - Improved :doc:`cppcoreguidelines-pro-bounds-pointer-arithmetic ` check by fixing false positives when calling indexing operators that do not perform @@ -342,6 +342,11 @@ Changes in existing checks false negatives where math expressions are the operand of assignment operators or comparison operators. +- Improved :doc:`readability-named-parameter + ` check by adding the option + `InsertPlainNamesInForwardDecls` to insert parameter names without comments + for forward declarations only. + - Improved :doc:`readability-qualified-auto ` check by adding the option `AllowedTypes`, that excludes specified types from adding qualifiers. diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/named-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/named-parameter.rst index 73677a48605f4..48b7e84d38ec8 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/named-parameter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/named-parameter.rst @@ -23,3 +23,12 @@ If a parameter is not utilized, its name can be commented out in a function defi } Corresponding cpplint.py check name: `readability/function`. + +Options +------- + +.. option:: InsertPlainNamesInForwardDecls + + If set to `true`, the check will insert parameter names without comments for + forward declarations only. 
Otherwise, the check will insert parameter names + as comments (e.g., ``/*param*/``). Default is `false`. diff --git a/clang-tools-extra/test/clang-doc/json/class-requires.cpp b/clang-tools-extra/test/clang-doc/json/class-requires.cpp index 2dd25771d6d8b..213da93a1adfa 100644 --- a/clang-tools-extra/test/clang-doc/json/class-requires.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-requires.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.json +// RUN: FileCheck %s < %t/_ZTV7MyClass.json template concept Addable = requires(T a, T b) { diff --git a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp new file mode 100644 index 0000000000000..e9259edad5cb8 --- /dev/null +++ b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp @@ -0,0 +1,37 @@ +// RUN: rm -rf %t && mkdir -p %t +// RUN: clang-doc --output=%t --format=json --executor=standalone %s +// RUN: FileCheck %s < %t/_ZTV7MyClass.json --check-prefix=BASE +// RUN: FileCheck %s < %t/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION + +template struct MyClass {}; + +template<> struct MyClass {}; + +// BASE: "MangledName": "_ZTV7MyClass", +// BASE-NEXT: "Name": "MyClass", +// BASE-NEXT: "Namespace": [ +// BASE-NEXT: "GlobalNamespace" +// BASE-NEXT: ], +// BASE-NEXT: "Path": "GlobalNamespace", +// BASE-NEXT: "TagType": "struct", +// BASE-NEXT: "Template": { +// BASE-NEXT: "Parameters": [ +// BASE-NEXT: "typename T" +// BASE-NEXT: ] +// BASE-NEXT: }, + +// SPECIALIZATION: "MangledName": "_ZTV7MyClassIiE", +// SPECIALIZATION-NEXT: "Name": "MyClass", +// SPECIALIZATION-NEXT: "Namespace": [ +// SPECIALIZATION-NEXT: "GlobalNamespace" +// SPECIALIZATION-NEXT: ], +// SPECIALIZATION-NEXT: "Path": "GlobalNamespace", +// SPECIALIZATION-NEXT: "TagType": "struct", +// SPECIALIZATION-NEXT: 
"Template": { +// SPECIALIZATION-NEXT: "Specialization": { +// SPECIALIZATION-NEXT: "Parameters": [ +// SPECIALIZATION-NEXT: "int" +// SPECIALIZATION-NEXT: ], +// SPECIALIZATION-NEXT: "SpecializationOf": "{{[0-9A-F]*}}" +// SPECIALIZATION-NEXT: } +// SPECIALIZATION-NEXT: }, diff --git a/clang-tools-extra/test/clang-doc/json/class-template.cpp b/clang-tools-extra/test/clang-doc/json/class-template.cpp index fb9c4c2f21c2e..6cdc3e9175278 100644 --- a/clang-tools-extra/test/clang-doc/json/class-template.cpp +++ b/clang-tools-extra/test/clang-doc/json/class-template.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.json +// RUN: FileCheck %s < %t/_ZTV7MyClass.json template struct MyClass { T MemberTemplate; diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp index ae47da75edccb..d8317eafea91a 100644 --- a/clang-tools-extra/test/clang-doc/json/class.cpp +++ b/clang-tools-extra/test/clang-doc/json/class.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.json +// RUN: FileCheck %s < %t/_ZTV7MyClass.json struct Foo; @@ -134,6 +134,7 @@ struct MyClass { // CHECK-NEXT: "Filename": "{{.*}}class.cpp", // CHECK-NEXT: "LineNumber": 10 // CHECK-NEXT: }, +// CHECK-NEXT: "MangledName": "_ZTV7MyClass", // CHECK-NEXT: "Name": "MyClass", // CHECK-NEXT: "Namespace": [ // CHECK-NEXT: "GlobalNamespace" diff --git a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp index b49dec5cc78c5..34acb6808409d 100644 --- a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp +++ b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc 
--extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/index.json +// RUN: FileCheck %s < %t/index.json template concept Incrementable = requires (T a) { a++; diff --git a/clang-tools-extra/test/clang-doc/json/concept.cpp b/clang-tools-extra/test/clang-doc/json/concept.cpp index 887c9d79146a0..b946393274c85 100644 --- a/clang-tools-extra/test/clang-doc/json/concept.cpp +++ b/clang-tools-extra/test/clang-doc/json/concept.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/index.json +// RUN: FileCheck %s < %t/index.json // Requires that T suports post and pre-incrementing. template diff --git a/clang-tools-extra/test/clang-doc/json/function-requires.cpp b/clang-tools-extra/test/clang-doc/json/function-requires.cpp index 4e8432e088c4f..08ac4c7ed2ca3 100644 --- a/clang-tools-extra/test/clang-doc/json/function-requires.cpp +++ b/clang-tools-extra/test/clang-doc/json/function-requires.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/index.json +// RUN: FileCheck %s < %t/index.json template concept Incrementable = requires(T x) { diff --git a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp index 7005fb7b3e66e..b194e3371bf76 100644 --- a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp +++ b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/index.json +// RUN: FileCheck %s < %t/index.json static void myFunction() {} diff --git 
a/clang-tools-extra/test/clang-doc/json/method-template.cpp b/clang-tools-extra/test/clang-doc/json/method-template.cpp index ea9110d6c2d1c..ac8450a72d3a7 100644 --- a/clang-tools-extra/test/clang-doc/json/method-template.cpp +++ b/clang-tools-extra/test/clang-doc/json/method-template.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.json +// RUN: FileCheck %s < %t/_ZTV7MyClass.json struct MyClass { template T methodTemplate(T param) { diff --git a/clang-tools-extra/test/clang-doc/json/namespace.cpp b/clang-tools-extra/test/clang-doc/json/namespace.cpp index 6e4fc6938d856..779d7b49f5581 100644 --- a/clang-tools-extra/test/clang-doc/json/namespace.cpp +++ b/clang-tools-extra/test/clang-doc/json/namespace.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/index.json +// RUN: FileCheck %s < %t/index.json class MyClass {}; diff --git a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp index 9b176feb67a00..54f95c4d041ca 100644 --- a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp +++ b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --output=%t --format=json --executor=standalone %s -// RUN: FileCheck %s < %t/nested/index.json --check-prefix=NESTED -// RUN: FileCheck %s < %t/nested/inner/index.json --check-prefix=INNER +// RUN: FileCheck %s < %t/nested.json --check-prefix=NESTED +// RUN: FileCheck %s < %t/inner.json --check-prefix=INNER namespace nested { int Global; diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp index 50433d5d12ea9..8ae0d7055867b 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp @@ -1,29 +1,47 @@ // RUN: %check_clang_tidy %s readability-named-parameter %t +// RUN: %check_clang_tidy -check-suffix=PLAIN-NAMES %s readability-named-parameter %t -- \ +// RUN: -config="{CheckOptions: [{key: readability-named-parameter.InsertPlainNamesInForwardDecls, value: true}]}" void Method(char *) { /* */ } // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: all parameters should be named in a function // CHECK-FIXES: void Method(char * /*unused*/) { /* */ } +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:19: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void Method(char * /*unused*/) { /* */ } void Method2(char *) {} // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: all parameters should be named in a function // CHECK-FIXES: void Method2(char * /*unused*/) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:20: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void Method2(char * /*unused*/) {} void Method3(char *, void *) {} // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: all parameters should be named in a function // CHECK-FIXES: void Method3(char * /*unused*/, void * /*unused*/) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:20: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void Method3(char * /*unused*/, void * /*unused*/) {} void Method4(char *, int /*unused*/) {} // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: all parameters should be named in a function // CHECK-FIXES: void Method4(char * /*unused*/, int /*unused*/) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:20: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void Method4(char * /*unused*/, int /*unused*/) {} void operator delete[](void *) throw() {} // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: all parameters should be 
named in a function // CHECK-FIXES: void operator delete[](void * /*unused*/) throw() {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:30: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void operator delete[](void * /*unused*/) throw() {} int Method5(int) { return 0; } // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: all parameters should be named in a function // CHECK-FIXES: int Method5(int /*unused*/) { return 0; } +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:16: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: int Method5(int /*unused*/) { return 0; } void Method6(void (*)(void *)) {} // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: all parameters should be named in a function // CHECK-FIXES: void Method6(void (* /*unused*/)(void *)) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:21: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void Method6(void (* /*unused*/)(void *)) {} template void Method7(T) {} // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: all parameters should be named in a function // CHECK-FIXES: template void Method7(T /*unused*/) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:37: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: template void Method7(T /*unused*/) {} // Don't warn in macros. 
#define M void MethodM(int) {} @@ -55,6 +73,8 @@ struct Y { void foo(T) {} // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: all parameters should be named in a function // CHECK-FIXES: void foo(T /*unused*/) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:13: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void foo(T /*unused*/) {} }; Y y; @@ -69,19 +89,27 @@ struct Derived : public Base { void foo(int); // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: all parameters should be named in a function // CHECK-FIXES: void foo(int /*argname*/); +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:15: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void foo(int argname); }; void FDef(int); // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: all parameters should be named in a function // CHECK-FIXES: void FDef(int /*n*/); +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:14: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void FDef(int n); void FDef(int n) {} void FDef2(int, int); // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: all parameters should be named in a function // CHECK-FIXES: void FDef2(int /*n*/, int /*unused*/); +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:15: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void FDef2(int n, int /*unused*/); void FDef2(int n, int) {} // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: all parameters should be named in a function // CHECK-FIXES: void FDef2(int n, int /*unused*/) {} +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:22: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: void FDef2(int n, int /*unused*/) {} void FNoDef(int); @@ -91,18 +119,26 @@ Z the_z; Z &operator++(Z&) { return the_z; } // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function // CHECK-FIXES: Z &operator++(Z& /*unused*/) { return the_z; } +// CHECK-MESSAGES-PLAIN-NAMES: 
:[[@LINE-3]]:17: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: Z &operator++(Z& /*unused*/) { return the_z; } Z &operator++(Z&, int) { return the_z; } // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function // CHECK-FIXES: Z &operator++(Z& /*unused*/, int) { return the_z; } +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:17: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: Z &operator++(Z& /*unused*/, int) { return the_z; } Z &operator--(Z&) { return the_z; } // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function // CHECK-FIXES: Z &operator--(Z& /*unused*/) { return the_z; } +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:17: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: Z &operator--(Z& /*unused*/) { return the_z; } Z &operator--(Z&, int) { return the_z; } // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function // CHECK-FIXES: Z &operator--(Z& /*unused*/, int) { return the_z; } +// CHECK-MESSAGES-PLAIN-NAMES: :[[@LINE-3]]:17: warning: all parameters should be named in a function +// CHECK-FIXES-PLAIN-NAMES: Z &operator--(Z& /*unused*/, int) { return the_z; } namespace testing { namespace internal { diff --git a/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp index 09e522133d832..5927235b3bd93 100644 --- a/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/JSONGeneratorTest.cpp @@ -67,6 +67,7 @@ TEST(JSONGeneratorTest, emitRecordJSON) { "IsParent": true, "IsTypedef": false, "IsVirtual": true, + "MangledName": "", "Name": "F", "Path": "path/to/F", "PublicFunctions": [ @@ -112,6 +113,7 @@ TEST(JSONGeneratorTest, emitRecordJSON) { "Filename": "main.cpp", "LineNumber": 1 }, + "MangledName": "", "Name": "Foo", "Namespace": [ "GlobalNamespace" 
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index c61c808831704..ab374c1886165 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -1554,9 +1554,9 @@ the configuration (without a prefix: ``Auto``). .. code-block:: c++ - #define A \ - int aaaa; \ - int b; \ + #define A \ + int aaaa; \ + int b; \ int dddddddddd; diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b6af43c062013..e81a3d4976cf8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -312,6 +312,15 @@ C23 Feature Support `WG14 N2975 `_ - Fixed a bug with handling the type operand form of ``typeof`` when it is used to specify a fixed underlying type for an enumeration. #GH146351 +- Fixed a rejects-valid bug where Clang would reject an enumeration with an + ``_Atomic`` underlying type. The underlying type is the non-atomic, + unqualified version of the specified type. Due to the perhaps surprising lack + of atomic behavior, this is diagnosed under + ``-Wunderlying-atomic-qualifier-ignored``, which defaults to an error. This + can be downgraded with ``-Wno-underlying-atomic-qualifier-ignored`` or + ``-Wno-error=underlying-atomic-qualifier-ignored``. Clang now also diagnoses + cv-qualifiers as being ignored, but that warning does not default to an error. + It can be controlled by ``-Wunderlying-cv-qualifier-ignore``. (#GH147736) C11 Feature Support ^^^^^^^^^^^^^^^^^^^ @@ -895,6 +904,7 @@ Bug Fixes to C++ Support - Fixed a Clang regression in C++20 mode where unresolved dependent call expressions were created inside non-dependent contexts (#GH122892) - Clang now emits the ``-Wunused-variable`` warning when some structured bindings are unused and the ``[[maybe_unused]]`` attribute is not applied. (#GH125810) +- Fixed ``static_cast`` not performing access or ambiguity checks when converting to an rvalue reference to a base class. 
(#GH121429) - Declarations using class template argument deduction with redundant parentheses around the declarator are no longer rejected. (#GH39811) - Fixed a crash caused by invalid declarations of ``std::initializer_list``. (#GH132256) @@ -932,6 +942,7 @@ Bug Fixes to C++ Support - Fix a bug where private access specifier of overloaded function not respected. (#GH107629) - Correctly handles calling an explicit object member function template overload set through its address (``(&Foo::bar)()``). +- Fix a crash when forming an invalid call to an operator with an explicit object member. (#GH147121) - Correctly handle allocations in the condition of a ``if constexpr``.(#GH120197) (#GH134820) - Fixed a crash when handling invalid member using-declaration in C++20+ mode. (#GH63254) - Fixed parsing of lambda expressions that appear after ``*`` or ``&`` in contexts where a declaration can appear. (#GH63880) @@ -1074,6 +1085,8 @@ RISC-V Support CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ +* Provide a __device__ version of std::__glibcxx_assert_fail() in a header wrapper. + CUDA Support ^^^^^^^^^^^^ diff --git a/clang/include/clang/APINotes/APINotesManager.h b/clang/include/clang/APINotes/APINotesManager.h index 98592438e90ea..772fa5faa0f87 100644 --- a/clang/include/clang/APINotes/APINotesManager.h +++ b/clang/include/clang/APINotes/APINotesManager.h @@ -50,6 +50,13 @@ class APINotesManager { /// source file from which an entity was declared. bool ImplicitAPINotes; + /// Whether to apply all APINotes as optionally-applied versioned + /// entities. This means that when building a Clang module, + /// we capture every note on a given decl wrapped in a SwiftVersionedAttr + /// (with an empty version field for unversioned notes), and have the + /// client apply the relevant version's notes. + bool VersionIndependentSwift; + /// The Swift version to use when interpreting versioned API notes. 
llvm::VersionTuple SwiftVersion; @@ -167,6 +174,8 @@ class APINotesManager { /// Find the API notes readers that correspond to the given source location. llvm::SmallVector findAPINotes(SourceLocation Loc); + + bool captureVersionIndependentSwift() { return VersionIndependentSwift; } }; } // end namespace api_notes diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 5b2206af75bee..1118d3e062e68 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -1776,7 +1776,8 @@ class OMPAtClause final : public OMPClause { } }; -/// This represents 'severity' clause in the '#pragma omp error' directive +/// This represents the 'severity' clause in the '#pragma omp error' and the +/// '#pragma omp parallel' directives. /// /// \code /// #pragma omp error severity(fatal) @@ -1856,7 +1857,8 @@ class OMPSeverityClause final : public OMPClause { } }; -/// This represents 'message' clause in the '#pragma omp error' directive +/// This represents the 'message' clause in the '#pragma omp error' and the +/// '#pragma omp parallel' directives. /// /// \code /// #pragma omp error message("GNU compiler required.") diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h new file mode 100644 index 0000000000000..9998702a41cab --- /dev/null +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h @@ -0,0 +1,30 @@ +//===- LifetimeSafety.h - C++ Lifetime Safety Analysis -*----------- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the entry point for a dataflow-based static analysis +// that checks for C++ lifetime violations. 
+// +// The analysis is based on the concepts of "origins" and "loans" to track +// pointer lifetimes and detect issues like use-after-free and dangling +// pointers. See the RFC for more details: +// https://discourse.llvm.org/t/rfc-intra-procedural-lifetime-analysis-in-clang/86291 +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H +#define LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H +#include "clang/AST/DeclBase.h" +#include "clang/Analysis/AnalysisDeclContext.h" +#include "clang/Analysis/CFG.h" +namespace clang { + +void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, + AnalysisDeclContext &AC); + +} // namespace clang + +#endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Analysis/Analyses/UninitializedValues.h b/clang/include/clang/Analysis/Analyses/UninitializedValues.h index a2b37deddcec2..b151bc3f58321 100644 --- a/clang/include/clang/Analysis/Analyses/UninitializedValues.h +++ b/clang/include/clang/Analysis/Analyses/UninitializedValues.h @@ -47,6 +47,9 @@ class UninitUse { /// Does this use always see an uninitialized value? bool AlwaysUninit; + /// Is this use a const reference to this variable? + bool ConstRefUse = false; + /// This use is always uninitialized if it occurs after any of these branches /// is taken. SmallVector UninitBranches; @@ -61,10 +64,13 @@ class UninitUse { void setUninitAfterCall() { UninitAfterCall = true; } void setUninitAfterDecl() { UninitAfterDecl = true; } + void setConstRefUse() { ConstRefUse = true; } /// Get the expression containing the uninitialized use. const Expr *getUser() const { return User; } + bool isConstRefUse() const { return ConstRefUse; } + /// The kind of uninitialized use. enum Kind { /// The use might be uninitialized. 
@@ -110,10 +116,6 @@ class UninitVariablesHandler { virtual void handleUseOfUninitVariable(const VarDecl *vd, const UninitUse &use) {} - /// Called when the uninitialized variable is used as const refernce argument. - virtual void handleConstRefUseOfUninitVariable(const VarDecl *vd, - const UninitUse &use) {} - /// Called when the uninitialized variable analysis detects the /// idiom 'int x = x'. All other uses of 'x' within the initializer /// are handled by handleUseOfUninitVariable. diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 0912a004549ae..224cb6a32af28 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3068,6 +3068,26 @@ def Regparm : TypeAttr { let ASTNode = 0; } +def SwiftType : Attr { + // This attribute has no spellings as it is only ever created implicitly + // from API notes. + let Spellings = []; + let Args = [StringArgument<"TypeString">]; + let SemaHandler = 0; + let Documentation = [InternalOnly]; +} + +def SwiftNullability : Attr { + // This attribute has no spellings as it is only ever created implicitly + // from API notes. 
+ let Spellings = []; + let Args = [EnumArgument<"Kind", "Kind", /*is_string=*/false, + ["non_null", "nullable", "unspecified", "nullable_result"], + ["NonNull", "Nullable", "Unspecified", "NullableResult"]>]; + let SemaHandler = 0; + let Documentation = [InternalOnly]; +} + def SwiftAsyncName : InheritableAttr { let Spellings = [GNU<"swift_async_name">]; let Args = [StringArgument<"Name">]; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a5ee8013adff6..4d371a9f7d6db 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -665,6 +665,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "n TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") + TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index f54a830b0103e..9a7a308600763 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -532,6 +532,9 @@ def Dangling : DiagGroup<"dangling", [DanglingAssignment, DanglingInitializerList, DanglingGsl, ReturnStackAddress]>; + +def LifetimeSafety : DiagGroup<"experimental-lifetime-safety">; + def DistributedObjectModifiers : DiagGroup<"distributed-object-modifiers">; def DllexportExplicitInstantiationDecl : DiagGroup<"dllexport-explicit-instantiation-decl">; def ExcessInitializers : DiagGroup<"excess-initializers">; diff 
--git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 934f4453f02b9..3b8f396e37c48 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9365,6 +9365,16 @@ def warn_atomic_implicit_seq_cst : Warning< InGroup>, DefaultIgnore; def err_atomic_unsupported : Error< "atomic types are not supported in '%0'">; +def warn_cv_stripped_in_enum : Warning< + "%enum_select{" + "%Both{'const' and 'volatile' qualifiers}|" + "%Const{'const' qualifier}|" + "%Volatile{'volatile' qualifier}}0 in enumeration underlying type ignored">, + InGroup>; +def warn_atomic_stripped_in_enum : Warning< + "'_Atomic' qualifier ignored; operations involving the enumeration type will " + "be non-atomic">, + InGroup>, DefaultError; def err_overflow_builtin_must_be_int : Error< "operand argument to %select{overflow builtin|checked integer operation}0 " @@ -10627,6 +10637,10 @@ def warn_dangling_reference_captured_by_unknown : Warning< "object whose reference is captured will be destroyed at the end of " "the full-expression">, InGroup; +def warn_experimental_lifetime_safety_dummy_warning : Warning< + "todo: remove this warning after we have atleast one warning based on the lifetime analysis">, + InGroup, DefaultIgnore; + // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. 
// Array comparisons have similar warnings diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 72321c204ce96..e43238ba683f2 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -390,6 +390,7 @@ LANGOPT(RetainCommentsFromSystemHeaders, 1, 0, Compatible, "retain documentation LANGOPT(APINotes, 1, 0, NotCompatible, "use external API notes") LANGOPT(APINotesModules, 1, 0, NotCompatible, "use module-based external API notes") +LANGOPT(SwiftVersionIndependentAPINotes, 1, 0, NotCompatible, "use external API notes capturing all versions") LANGOPT(SanitizeAddressFieldPadding, 2, 0, NotCompatible, "controls how aggressive is ASan " "field padding (0: none, 1:least " diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index f0a8b32bf2f88..b4b94b8816d48 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -19,7 +19,7 @@ include "arm_sve_sme_incl.td" // Loads // Load one vector (scalar base) -def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfdm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfdbm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_ld1">; def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_ld1">; def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt16, "aarch64_sve_ld1">; @@ -27,13 +27,8 @@ def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtRetu def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_ld1">; def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, VerifyRuntimeMode], 
MemEltTyInt32, "aarch64_sve_ld1">; -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; - def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; -} - // Load one vector (scalar base, VL displacement) -def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdbm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ld1">; def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_ld1">; def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_ld1">; def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad, VerifyRuntimeMode], MemEltTyInt16, "aarch64_sve_ld1">; @@ -121,7 +116,7 @@ def SVLD1UW_GATHER_INDEX_S : MInst<"svld1uw_gather[_{2}base]_index_{d}", "dPul // First-faulting load one vector (scalar base) -def SVLDFF1 : MInst<"svldff1[_{2}]", "dPc", "csilUcUsUiUlhfdm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; +def SVLDFF1 : MInst<"svldff1[_{2}]", "dPc", "csilUcUsUiUlhfdbm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; def SVLDFF1SB : MInst<"svldff1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ldff1">; def SVLDFF1UB : MInst<"svldff1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ldff1">; def SVLDFF1SH : MInst<"svldff1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ldff1">; @@ -130,7 +125,7 @@ def SVLDFF1SW : MInst<"svldff1sw_{d}", "dPU", "lUl", [IsLoad], def SVLDFF1UW : MInst<"svldff1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1">; // First-faulting load 
one vector (scalar base, VL displacement) -def SVLDFF1_VNUM : MInst<"svldff1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; +def SVLDFF1_VNUM : MInst<"svldff1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdbm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; def SVLDFF1SB_VNUM : MInst<"svldff1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ldff1">; def SVLDFF1UB_VNUM : MInst<"svldff1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ldff1">; def SVLDFF1SH_VNUM : MInst<"svldff1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ldff1">; @@ -139,11 +134,6 @@ def SVLDFF1SW_VNUM : MInst<"svldff1sw_vnum_{d}", "dPUl", "lUl", [IsL def SVLDFF1UW_VNUM : MInst<"svldff1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1">; } // let SVETargetGuard = "sve", SMETargetGuard = InvalidMode -let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in { - def SVLDFF1_BF : MInst<"svldff1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; - def SVLDFF1_VNUM_BF : MInst<"svldff1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">; -} - let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { // First-faulting load one vector (vector base) def SVLDFF1_GATHER_BASES_U : MInst<"svldff1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ldff1_gather_scalar_offset">; @@ -223,7 +213,7 @@ def SVLDFF1SW_GATHER_INDEX_S : MInst<"svldff1sw_gather[_{2}base]_index_{d}", "dP def SVLDFF1UW_GATHER_INDEX_S : MInst<"svldff1uw_gather[_{2}base]_index_{d}", "dPul", "lUl", [IsGatherLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1_gather_scalar_offset">; // Non-faulting load one vector (scalar base) -def SVLDNF1 : MInst<"svldnf1[_{2}]", "dPc", "csilUcUsUiUlhfdm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; +def SVLDNF1 : MInst<"svldnf1[_{2}]", "dPc", "csilUcUsUiUlhfdbm", 
[IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; def SVLDNF1SB : MInst<"svldnf1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ldnf1">; def SVLDNF1UB : MInst<"svldnf1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ldnf1">; def SVLDNF1SH : MInst<"svldnf1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ldnf1">; @@ -232,7 +222,7 @@ def SVLDNF1SW : MInst<"svldnf1sw_{d}", "dPU", "lUl", [IsLoad], def SVLDNF1UW : MInst<"svldnf1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldnf1">; // Non-faulting load one vector (scalar base, VL displacement) -def SVLDNF1_VNUM : MInst<"svldnf1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; +def SVLDNF1_VNUM : MInst<"svldnf1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdbm", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; def SVLDNF1SB_VNUM : MInst<"svldnf1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ldnf1">; def SVLDNF1UB_VNUM : MInst<"svldnf1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ldnf1">; def SVLDNF1SH_VNUM : MInst<"svldnf1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ldnf1">; @@ -241,34 +231,17 @@ def SVLDNF1SW_VNUM : MInst<"svldnf1sw_vnum_{d}", "dPUl", "lUl", [IsL def SVLDNF1UW_VNUM : MInst<"svldnf1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldnf1">; } // let SVETargetGuard = "sve", SMETargetGuard = InvalidMode -let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in { - def SVLDNF1_BF : MInst<"svldnf1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; - def SVLDNF1_VNUM_BF : MInst<"svldnf1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">; -} - // Load one vector, unextended load, non-temporal (scalar base) -def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfdm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, 
"aarch64_sve_ldnt1">; +def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfdbm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; // Load one vector, unextended load, non-temporal (scalar base, VL displacement) -def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; - def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; -} +def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfdbm", [IsLoad, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_ldnt1">; // Load one quadword and replicate (scalar base) -def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfdm", MergeNone, "aarch64_sve_ld1rq", [VerifyRuntimeMode]>; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [VerifyRuntimeMode]>; -} +def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfdbm", MergeNone, "aarch64_sve_ld1rq", [VerifyRuntimeMode]>; multiclass StructLoad f = []> { - def : SInst; - let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def: SInst; - } + def : SInst; } // Load N-element structure into N vectors (scalar base) @@ -283,10 +256,7 @@ defm SVLD4_VNUM : StructLoad<"svld4_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4_sret", // Load one octoword and replicate (scalar base) let SVETargetGuard = "sve,f64mm", SMETargetGuard = InvalidMode in { - def SVLD1RO : SInst<"svld1ro[_{2}]", "dPc", "csilUcUsUiUlhfdm", MergeNone, "aarch64_sve_ld1ro">; -} -let SVETargetGuard = "sve,f64mm,bf16", SMETargetGuard = InvalidMode in { - def SVLD1RO_BF16 : SInst<"svld1ro[_{2}]", "dPc", "b", MergeNone, 
"aarch64_sve_ld1ro">; + def SVLD1RO : SInst<"svld1ro[_{2}]", "dPc", "csilUcUsUiUlhfdbm", MergeNone, "aarch64_sve_ld1ro">; } let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in { @@ -343,7 +313,7 @@ let SVETargetGuard = "sve2p1|sme2p1", SMETargetGuard = "sve2p1|sme2p1" in { // Stores // Store one vector (scalar base) -def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfdm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfdbm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_st1">; def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_st1">; def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore, VerifyRuntimeMode], MemEltTyInt16, "aarch64_sve_st1">; @@ -352,7 +322,7 @@ def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore, Verify def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_st1">; // Store one vector (scalar base, VL displacement) -def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfdm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfdbm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_st1">; def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore, VerifyRuntimeMode], MemEltTyInt8, "aarch64_sve_st1">; def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore, VerifyRuntimeMode], MemEltTyInt16, "aarch64_sve_st1">; @@ -360,11 +330,6 @@ def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsSt def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", 
"vPCld", "l", [IsStore, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_st1">; def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_st1">; -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; - def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_st1">; -} - let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { // Store one vector (vector base) def SVST1_SCATTER_BASES_U : MInst<"svst1_scatter[_{2}base_{d}]", "vPud", "ilUiUlfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1_scatter_scalar_offset">; @@ -437,11 +402,9 @@ def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "v } // let SVETargetGuard = "sve" multiclass StructStore f = []> { - def : SInst; - let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def: SInst; - } + def : SInst; } + // Store N vectors into N-element structure (scalar base) defm SVST2 : StructStore<"svst2[_{d}]", "vPp2", "aarch64_sve_st2", [VerifyRuntimeMode]>; defm SVST3 : StructStore<"svst3[_{d}]", "vPp3", "aarch64_sve_st3", [VerifyRuntimeMode]>; @@ -453,15 +416,10 @@ defm SVST3_VNUM : StructStore<"svst3_vnum[_{d}]", "vPpl3", "aarch64_sve_st3", [V defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4", [VerifyRuntimeMode]>; // Store one vector, with no truncation, non-temporal (scalar base) -def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfdm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfdbm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) -def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", 
"csilUcUsUiUlhfdm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; - def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; -} +def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfdbm", [IsStore, VerifyRuntimeMode], MemEltTyDefault, "aarch64_sve_stnt1">; let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in { // Contiguous truncating store from quadword (single vector). @@ -563,18 +521,12 @@ def SVADRD : SInst<"svadrd[_{0}base]_[{2}]index", "uud", "ilUiUl", MergeNone, " // Scalar to vector def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone, "", [VerifyRuntimeMode]>; -def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone, "", [VerifyRuntimeMode]>; -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone, "", [VerifyRuntimeMode]>; -} +def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUshb", MergeNone, "", [VerifyRuntimeMode]>; def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone, "", [VerifyRuntimeMode]>; def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone, "", [VerifyRuntimeMode]>; multiclass svdup_base { - def NAME : SInst; - let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def _BF16: SInst; - } + def NAME : SInst; } defm SVDUP : svdup_base<"svdup[_n]_{d}", "ds", MergeNone, "aarch64_sve_dup_x">; @@ -700,10 +652,7 @@ def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aa def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckShiftRight, 1>]>; def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", 
"csil", MergeZero, "aarch64_sve_asrd", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [VerifyRuntimeMode]>; -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr", [VerifyRuntimeMode]>; -} +def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_insr", [VerifyRuntimeMode]>; //////////////////////////////////////////////////////////////////////////////// // Integer reductions @@ -786,13 +735,9 @@ multiclass SInstCLS def _Z : SInst; } -defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [VerifyRuntimeMode]>; -defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [VerifyRuntimeMode]>; -defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [VerifyRuntimeMode]>; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [VerifyRuntimeMode]>; -} +defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [VerifyRuntimeMode]>; +defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [VerifyRuntimeMode]>; +defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfdb", "aarch64_sve_cnt", [VerifyRuntimeMode]>; //////////////////////////////////////////////////////////////////////////////// // Conversion @@ -1034,10 +979,7 @@ def SVCVTXNT_F32_F64 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aar // Permutations and selection multiclass SVEPerm { - def : SInst; - let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def: SInst; - } + def : SInst; } defm SVCLASTA : SVEPerm<"svclasta[_{d}]", "dPdd", "aarch64_sve_clasta">; @@ -1053,51 +995,26 @@ def SVCOMPACT : SInst<"svcompact[_{d}]", "dPd", "ilUiUlfd", MergeNo // splat of any possible lane. 
It is upto LLVM to pick a more efficient // instruction such as DUP (indexed) if the lane index fits the range of the // instruction's immediate. -def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { -def SVDUP_LANE_BF16 : - SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; -} - -def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane", [VerifyRuntimeMode]>; -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane", [VerifyRuntimeMode]>; -} -def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; +def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_dupq_lane", [VerifyRuntimeMode]>; +def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_ext", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", "sPd", "aarch64_sve_lasta">; defm SVLASTB : SVEPerm<"svlastb[_{d}]", "sPd", "aarch64_sve_lastb">; -def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev", [VerifyRuntimeMode]>; -def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel", [VerifyRuntimeMode]>; -def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [VerifyRuntimeMode]>; -def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { - def 
SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; -} +def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_rev", [VerifyRuntimeMode]>; +def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_sel", [VerifyRuntimeMode]>; +def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_splice", [VerifyRuntimeMode]>; +def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_tbl", [VerifyRuntimeMode]>; -def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1", [VerifyRuntimeMode]>; -def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2", [VerifyRuntimeMode]>; +def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1", [VerifyRuntimeMode]>; +def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2", [VerifyRuntimeMode]>; def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi", [VerifyRuntimeMode]>; def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi", [VerifyRuntimeMode]>; def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo", [VerifyRuntimeMode]>; def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo", [VerifyRuntimeMode]>; -def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1", [VerifyRuntimeMode]>; -def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2", [VerifyRuntimeMode]>; -def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [VerifyRuntimeMode]>; -def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [VerifyRuntimeMode]>; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { 
-def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckExtract, 1>]>; -def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [VerifyRuntimeMode]>; -def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [VerifyRuntimeMode]>; -def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice", [VerifyRuntimeMode]>; -def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1", [VerifyRuntimeMode]>; -def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2", [VerifyRuntimeMode]>; -def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1", [VerifyRuntimeMode]>; -def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2", [VerifyRuntimeMode]>; -def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1", [VerifyRuntimeMode]>; -def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2", [VerifyRuntimeMode]>; -} +def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1", [VerifyRuntimeMode]>; +def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp2", [VerifyRuntimeMode]>; +def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_zip1", [VerifyRuntimeMode]>; +def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_zip2", [VerifyRuntimeMode]>; def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev", [VerifyRuntimeMode]>; def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone, VerifyRuntimeMode]>; @@ -1200,11 +1117,7 @@ def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendS def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone, 
VerifyRuntimeMode]>; def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [VerifyRuntimeMode]>; -def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [VerifyRuntimeMode]>; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { -def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [VerifyRuntimeMode]>; -} +def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfdb", MergeNone, "", [VerifyRuntimeMode]>; //////////////////////////////////////////////////////////////////////////////// // Saturating scalar arithmetic @@ -1290,44 +1203,24 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla let SVETargetGuard = "sve,f64mm", SMETargetGuard = InvalidMode in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; -def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; -def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; -} - -let SVETargetGuard = "sve,bf16,f64mm", SMETargetGuard = InvalidMode in { -def SVTRN1Q_BF16 : SInst<"svtrn1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1q">; -def SVTRN2Q_BF16 : SInst<"svtrn2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", 
"b", MergeNone, "aarch64_sve_zip2q">; +def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">; +def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">; +def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">; +def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp2q">; +def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_zip1q">; +def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_zip2q">; } //////////////////////////////////////////////////////////////////////////////// // Vector creation -def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfdm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfdm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfdm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfdm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; +def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfdbm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; +def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfdbm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; +def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfdbm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; +def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfdbm", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; -def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; -def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; - -let 
SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { -def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; -def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef, VerifyRuntimeMode]>; - -def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; -def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; -def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; -} +def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; +def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; +def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; let SVETargetGuard = "sve2p1|sme2", SMETargetGuard = "sve2p1|sme2" in { def SVCREATE_2_B : SInst<"svcreate2[_b]", "2dd", "Pc", MergeNone, "", [IsTupleCreate, VerifyRuntimeMode]>; @@ -1336,23 +1229,13 @@ let SVETargetGuard = "sve2p1|sme2", SMETargetGuard = "sve2p1|sme2" in { //////////////////////////////////////////////////////////////////////////////// // Vector insertion and extraction -def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; +def 
SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfdm", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; - -let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in { -def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; - -def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; -} +def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_2>]>; 
+def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfdbm", MergeNone, "", [IsTupleSet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; let SVETargetGuard = "sve2p1|sme2", SMETargetGuard = "sve2p1|sme2" in { def SVGET_2_B : SInst<"svget2[_b]", "d2i", "Pc", MergeNone, "", [IsTupleGet, VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; @@ -1922,49 +1805,33 @@ def SVNMATCH : SInst<"svnmatch[_{d}]", "PPdd", "csUcUs", MergeNone, "aarch64_sve // SVE2 - Contiguous conflict detection let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, VerifyRuntimeMode]>; -def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; +def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUshb", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW, VerifyRuntimeMode]>; -def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; +def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUshb", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW, VerifyRuntimeMode]>; def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, VerifyRuntimeMode]>; } -let SVETargetGuard = "sve2,bf16", SMETargetGuard = "sme,bf16" in { -def SVWHILERW_H_BF16 : 
SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; -def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, VerifyRuntimeMode]>; -} - //////////////////////////////////////////////////////////////////////////////// // SVE2 - Extended table lookup/permute let SVETargetGuard = "sve2", SMETargetGuard = "sme" in { -def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [VerifyRuntimeMode]>; -def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; -} - -let SVETargetGuard = "sve2,bf16", SMETargetGuard = "sme,bf16" in { -def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRuntimeMode]>; -def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; +def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfdb", MergeNone, "", [VerifyRuntimeMode]>; +def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Lookup table let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in { def SVLUTI2_B : SInst<"svluti2_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUshb", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; - def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", 
[VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUshb", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; -} - -let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in { - def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; - def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// @@ -2322,15 +2189,11 @@ let SVETargetGuard = "sve2p1|sme2p1", SMETargetGuard = "sve2p1|sme2p1" in { let SVETargetGuard = "sve2p1|sme2p1", SMETargetGuard = "sve2p1|sme2p1" in { // DUPQ def SVDUP_LANEQ_B : SInst<"svdup_laneq[_{d}]", "ddi", "cUcm", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_15>]>; - def SVDUP_LANEQ_H : SInst<"svdup_laneq[_{d}]", "ddi", "sUsh", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_7>]>; + def SVDUP_LANEQ_H : SInst<"svdup_laneq[_{d}]", "ddi", "sUshb", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_7>]>; def SVDUP_LANEQ_S : SInst<"svdup_laneq[_{d}]", "ddi", "iUif", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_3>]>; def SVDUP_LANEQ_D : SInst<"svdup_laneq[_{d}]", "ddi", "lUld", MergeNone, 
"aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_1>]>; } -let SVETargetGuard = "(sve2p1|sme2p1),bf16", SMETargetGuard = "(sve2p1|sme2p1),bf16" in { - def SVDUP_LANEQ_BF16 : SInst<"svdup_laneq[_{d}]", "ddi", "b", MergeNone, "aarch64_sve_dup_laneq", [VerifyRuntimeMode], [ImmCheck<1, ImmCheck0_7>]>; -} - // // Multi-vector convert to/from floating-point. // diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index d800028cdcee5..275bb2b9924dd 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -499,7 +499,7 @@ multiclass RVVPseudoVWCVTBuiltin(ResultType)->getElementType(); + auto ElemTy = cast(Ops[1]->getType())->getElementType(); Ops.insert(Ops.begin() + 2, llvm::Constant::getNullValue(ElemTy)); if (IsMasked) { Ops.push_back(ConstantInt::get(Ops.back()->getType(), PolicyAttrs)); diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 6529f1386599c..99fcb322a42d5 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1669,6 +1669,94 @@ def GetGlobalOp : CIR_Op<"get_global", }]; } +//===----------------------------------------------------------------------===// +// SetBitfieldOp +//===----------------------------------------------------------------------===// + +def SetBitfieldOp : CIR_Op<"set_bitfield"> { + let summary = "Set the value of a bitfield member"; + let description = [{ + The `cir.set_bitfield` operation provides a store-like access to + a bit field of a record. + + A bitfield info attribute must be provided to describe the location of + the bitfield within the memory referenced by the $addr argument. + The $src argument is inserted at the appropriate place in the memory and + the value that was stored. Returns the value being stored. + + A unit attribute `volatile` can be used to indicate a volatile store of the + bitfield. 
+ ```mlir + cir.set_bitfield(#bfi, %0 : !cir.ptr, %1 : !s32i) {is_volatile} + -> !s32i + ``` + + Example. + Suppose we have a struct with multiple bitfields stored in + different storages. The `cir.set_bitfield` operation sets the value + of the bitfield. + ```C++ + typedef struct { + int a : 4; + int b : 27; + int c : 17; + int d : 2; + int e : 15; + } S; + + void store_bitfield(S& s) { + s.e = 3; + } + ``` + + ```mlir + // 'e' is in the storage with the index 1 + !record_type = !cir.record} #cir.record.decl.ast> + #bfi_e = #cir.bitfield_info + + %1 = cir.const #cir.int<3> : !s32i + %2 = cir.load %0 : !cir.ptr>, !cir.ptr + %3 = cir.get_member %2[1] {name = "e"} : !cir.ptr + -> !cir.ptr + %4 = cir.set_bitfield(#bfi_e, %3 : !cir.ptr, %1 : !s32i) -> !s32i + ``` + }]; + + let arguments = (ins + Arg:$addr, + CIR_AnyType:$src, + BitfieldInfoAttr:$bitfield_info, + UnitAttr:$is_volatile + ); + + let results = (outs CIR_IntType:$result); + + let assemblyFormat = [{ `(`$bitfield_info`,` $addr`:`qualified(type($addr))`,` + $src`:`type($src) `)` attr-dict `->` type($result) }]; + + let builders = [ + OpBuilder<(ins "mlir::Type":$type, + "mlir::Value":$addr, + "mlir::Type":$storage_type, + "mlir::Value":$src, + "llvm::StringRef":$name, + "unsigned":$size, + "unsigned":$offset, + "bool":$is_signed, + "bool":$is_volatile + ), + [{ + BitfieldInfoAttr info = + BitfieldInfoAttr::get($_builder.getContext(), + name, storage_type, + size, offset, is_signed); + build($_builder, $_state, type, addr, src, info, is_volatile); + }]> + ]; +} + //===----------------------------------------------------------------------===// // GetBitfieldOp //===----------------------------------------------------------------------===// @@ -1685,6 +1773,9 @@ def GetBitfieldOp : CIR_Op<"get_bitfield"> { A unit attribute `volatile` can be used to indicate a volatile load of the bitfield. 
+ ```mlir + cir.get_bitfield(#bfi, %0 {is_volatile} : !cir.ptr) -> !s32i + ``` Example: Suppose we have a struct with multiple bitfields stored in @@ -2521,6 +2612,62 @@ def ComplexImagOp : CIR_Op<"complex.imag", [Pure]> { let hasFolder = 1; } +//===----------------------------------------------------------------------===// +// ComplexRealPtrOp +//===----------------------------------------------------------------------===// + +def ComplexRealPtrOp : CIR_Op<"complex.real_ptr", [Pure]> { + let summary = "Derive a pointer to the real part of a complex value"; + let description = [{ + `cir.complex.real_ptr` operation takes a pointer operand that points to a + complex value of type `!cir.complex` and yields a pointer to the real part + of the operand. + + Example: + + ```mlir + %1 = cir.complex.real_ptr %0 : !cir.ptr> + -> !cir.ptr + ``` + }]; + + let results = (outs CIR_PtrToIntOrFloatType:$result); + let arguments = (ins CIR_PtrToComplexType:$operand); + + let assemblyFormat = [{ + $operand `:` + qualified(type($operand)) `->` qualified(type($result)) attr-dict + }]; + + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// ComplexAddOp +//===----------------------------------------------------------------------===// + +def ComplexAddOp : CIR_Op<"complex.add", [Pure, SameOperandsAndResultType]> { + let summary = "Complex addition"; + let description = [{ + The `cir.complex.add` operation takes two complex numbers and returns + their sum. 
+ + Example: + + ```mlir + %2 = cir.complex.add %0, %1 : !cir.complex + ``` + }]; + + let arguments = (ins CIR_ComplexType:$lhs, CIR_ComplexType:$rhs); + + let results = (outs CIR_ComplexType:$result); + + let assemblyFormat = [{ + $lhs `,` $rhs `:` qualified(type($result)) attr-dict + }]; +} + //===----------------------------------------------------------------------===// // Bit Manipulation Operations //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td index bcd516e27cc76..2bf77583465a6 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td @@ -159,6 +159,12 @@ def CIR_AnyIntOrFloatType : AnyTypeOf<[CIR_AnyFloatType, CIR_AnyIntType], let cppFunctionName = "isAnyIntegerOrFloatingPointType"; } +//===----------------------------------------------------------------------===// +// Complex Type predicates +//===----------------------------------------------------------------------===// + +def CIR_AnyComplexType : CIR_TypeBase<"::cir::ComplexType", "complex type">; + //===----------------------------------------------------------------------===// // Pointer Type predicates //===----------------------------------------------------------------------===// @@ -180,6 +186,17 @@ class CIR_PtrToPtrTo : CIR_ConfinedType], "pointer to pointer to " # summary>; +// Pointee type constraint bases +class CIR_PointeePred : SubstLeaves<"$_self", + "::mlir::cast<::cir::PointerType>($_self).getPointee()", pred>; + +class CIR_PtrToAnyOf types, string summary = ""> +: CIR_ConfinedType)>], + !if(!empty(summary), + "pointer to " # CIR_TypeSummaries.value, + summary)>; + // Void pointer type constraints def CIR_VoidPtrType : CIR_PtrTo<"::cir::VoidType", "void type">, @@ -192,6 +209,13 @@ def CIR_PtrToVoidPtrType "$_builder.getType<" # cppType # ">(" 
"cir::VoidType::get($_builder.getContext())))">; +class CIR_PtrToType : CIR_PtrToAnyOf<[type]>; + +// Pointer to type constraints +def CIR_PtrToIntOrFloatType : CIR_PtrToType; + +def CIR_PtrToComplexType : CIR_PtrToType; + //===----------------------------------------------------------------------===// // Vector Type predicates //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 928a37785ee16..b1314f2c53a79 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1891,6 +1891,12 @@ defm apinotes_modules : BoolOption<"f", "apinotes-modules", NegFlag, BothFlags<[], [ClangOption, CC1Option], " module-based external API notes support">>, Group; +defm swift_version_independent_apinotes : BoolOption<"f", "swift-version-independent-apinotes", + LangOpts<"SwiftVersionIndependentAPINotes">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [ClangOption, CC1Option], " version-independent external API notes support">>, + Group; def fapinotes_swift_version : Joined<["-"], "fapinotes-swift-version=">, Group, Visibility<[ClangOption, CC1Option]>, MetaVarName<"">, diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 5d1fdb153b26e..74b516fe4f071 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -513,9 +513,9 @@ struct FormatStyle { ENAS_LeftWithLastLine, /// Align escaped newlines in the right-most column. 
/// \code - /// #define A \ - /// int aaaa; \ - /// int b; \ + /// #define A \ + /// int aaaa; \ + /// int b; \ /// int dddddddddd; /// \endcode ENAS_Right, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 443a389ab6a18..b331acbe606b7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1614,7 +1614,17 @@ class Sema final : public SemaBase { /// /// Triggered by declaration-attribute processing. void ProcessAPINotes(Decl *D); - + /// Apply the 'Nullability:' annotation to the specified declaration + void ApplyNullability(Decl *D, NullabilityKind Nullability); + /// Apply the 'Type:' annotation to the specified declaration + void ApplyAPINotesType(Decl *D, StringRef TypeString); + + /// Whether APINotes should be gathered for all applicable Swift language + /// versions, without being applied. Leaving clients of the current module + /// to select and apply the correct version. + bool captureSwiftVersionIndependentAPINotes() { + return APINotes.captureVersionIndependentSwift(); + } ///@} // diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h index ee24e5d1543d3..c3601a4e73e1f 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h @@ -57,6 +57,10 @@ struct TranslationUnitDeps { /// determined that the differences are benign for this compilation. std::vector ClangModuleDeps; + /// A list of module names that are visible to this translation unit. This + /// includes both direct and transitive module dependencies. + std::vector VisibleModules; + /// A list of the C++20 named modules this translation unit depends on. 
std::vector NamedModuleDeps; @@ -150,7 +154,7 @@ class DependencyScanningTool { /// Given a compilation context specified via the Clang driver command-line, /// gather modular dependencies of module with the given name, and return the /// information needed for explicit build. - llvm::Expected getModuleDependencies( + llvm::Expected getModuleDependencies( StringRef ModuleName, const std::vector &CommandLine, StringRef CWD, const llvm::DenseSet &AlreadySeen, LookupModuleOutputCallback LookupModuleOutput); @@ -188,6 +192,10 @@ class FullDependencyConsumer : public DependencyConsumer { DirectModuleDeps.push_back(ID); } + void handleVisibleModule(std::string ModuleName) override { + VisibleModules.push_back(ModuleName); + } + void handleContextHash(std::string Hash) override { ContextHash = std::move(Hash); } @@ -201,7 +209,6 @@ class FullDependencyConsumer : public DependencyConsumer { } TranslationUnitDeps takeTranslationUnitDeps(); - ModuleDepsGraph takeModuleGraphDeps(); private: std::vector Dependencies; @@ -210,6 +217,7 @@ class FullDependencyConsumer : public DependencyConsumer { std::string ModuleName; std::vector NamedModuleDeps; std::vector DirectModuleDeps; + std::vector VisibleModules; std::vector Commands; std::string ContextHash; std::vector OutputPaths; diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h index 3e232c79397ce..6060e4b43312e 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h @@ -59,6 +59,8 @@ class DependencyConsumer { virtual void handleDirectModuleDependency(ModuleID MD) = 0; + virtual void handleVisibleModule(std::string ModuleName) = 0; + virtual void handleContextHash(std::string Hash) = 0; }; diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h 
b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index e96c49883d3c6..4136cb73f7043 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -323,6 +323,11 @@ class ModuleDepCollector final : public DependencyCollector { llvm::MapVector DirectPrebuiltModularDeps; /// Working set of direct modular dependencies. llvm::SetVector DirectModularDeps; + /// Working set of direct modular dependencies, as they were imported. + llvm::SmallPtrSet DirectImports; + /// All direct and transitive visible modules. + llvm::StringSet<> VisibleModules; + /// Options that control the dependency output generation. std::unique_ptr Opts; /// A Clang invocation that's based on the original TU invocation and that has @@ -337,6 +342,9 @@ class ModuleDepCollector final : public DependencyCollector { /// Checks whether the module is known as being prebuilt. bool isPrebuiltModule(const Module *M); + /// Computes all visible modules resolved from direct imports. + void addVisibleModules(); + /// Adds \p Path to \c FileDeps, making it absolute if necessary. void addFileDep(StringRef Path); /// Adds \p Path to \c MD.FileDeps, making it absolute if necessary. diff --git a/clang/lib/APINotes/APINotesManager.cpp b/clang/lib/APINotes/APINotesManager.cpp index 4dc6ffd66bd53..60868ab104c46 100644 --- a/clang/lib/APINotes/APINotesManager.cpp +++ b/clang/lib/APINotes/APINotesManager.cpp @@ -49,7 +49,8 @@ class PrettyStackTraceDoubleString : public llvm::PrettyStackTraceEntry { } // namespace APINotesManager::APINotesManager(SourceManager &SM, const LangOptions &LangOpts) - : SM(SM), ImplicitAPINotes(LangOpts.APINotes) {} + : SM(SM), ImplicitAPINotes(LangOpts.APINotes), + VersionIndependentSwift(LangOpts.SwiftVersionIndependentAPINotes) {} APINotesManager::~APINotesManager() { // Free the API notes readers. 
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index e7c085750b7ad..afa3b7ea7de7e 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6532,14 +6532,13 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (DiscardResult) return true; - if (const auto *ECD = dyn_cast(D)) { + if (const auto *ECD = dyn_cast(D)) return this->emitConst(ECD->getInitVal(), E); - } else if (const auto *BD = dyn_cast(D)) { - return this->visit(BD->getBinding()); - } else if (const auto *FuncDecl = dyn_cast(D)) { + if (const auto *FuncDecl = dyn_cast(D)) { const Function *F = getFunction(FuncDecl); return F && this->emitGetFnPtr(F, E); - } else if (const auto *TPOD = dyn_cast(D)) { + } + if (const auto *TPOD = dyn_cast(D)) { if (std::optional Index = P.getOrCreateGlobal(D)) { if (!this->emitGetPtrGlobal(*Index, E)) return false; @@ -6560,13 +6559,15 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { // value. bool IsReference = D->getType()->isReferenceType(); - // Check for local/global variables and parameters. + // Local variables. if (auto It = Locals.find(D); It != Locals.end()) { const unsigned Offset = It->second.Offset; if (IsReference) return this->emitGetLocal(classifyPrim(E), Offset, E); return this->emitGetPtrLocal(Offset, E); - } else if (auto GlobalIndex = P.getGlobal(D)) { + } + // Global variables. + if (auto GlobalIndex = P.getGlobal(D)) { if (IsReference) { if (!Ctx.getLangOpts().CPlusPlus11) return this->emitGetGlobal(classifyPrim(E), *GlobalIndex, E); @@ -6574,7 +6575,9 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { } return this->emitGetPtrGlobal(*GlobalIndex, E); - } else if (const auto *PVD = dyn_cast(D)) { + } + // Function parameters. 
+ if (const auto *PVD = dyn_cast(D)) { if (auto It = this->Params.find(PVD); It != this->Params.end()) { if (IsReference || !It->second.IsPtr) return this->emitGetParam(classifyPrim(E), It->second.Offset, E); @@ -6600,7 +6603,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { return this->visitDeclRef(D, E); }; - // Handle lambda captures. + // Lambda captures. if (auto It = this->LambdaCaptures.find(D); It != this->LambdaCaptures.end()) { auto [Offset, IsPtr] = It->second; @@ -6608,12 +6611,17 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (IsPtr) return this->emitGetThisFieldPtr(Offset, E); return this->emitGetPtrThisField(Offset, E); - } else if (const auto *DRE = dyn_cast(E); - DRE && DRE->refersToEnclosingVariableOrCapture()) { + } + + if (const auto *DRE = dyn_cast(E); + DRE && DRE->refersToEnclosingVariableOrCapture()) { if (const auto *VD = dyn_cast(D); VD && VD->isInitCapture()) return revisit(VD); } + if (const auto *BD = dyn_cast(D)) + return this->visit(BD->getBinding()); + // Avoid infinite recursion. 
if (D == InitializingDecl) return this->emitDummyPtr(D, E); @@ -6666,7 +6674,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (VD->evaluateValue()) return revisit(VD); - if (!D->getType()->isReferenceType()) + if (!IsReference) return this->emitDummyPtr(D, E); return this->emitInvalidDeclRef(cast(E), diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index be77657acabcc..457de2bed37d6 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -445,13 +445,7 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { assert(Desc); const auto *D = Desc->asVarDecl(); - if (!D || !D->hasGlobalStorage()) - return true; - - if (D == S.EvaluatingDecl) - return true; - - if (D->isConstexpr()) + if (!D || D == S.EvaluatingDecl || D->isConstexpr()) return true; // If we're evaluating the initializer for a constexpr variable in C23, we may @@ -576,23 +570,14 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!Ptr.isConst() || Ptr.isMutable()) return true; - // The This pointer is writable in constructors and destructors, - // even if isConst() returns true. - // TODO(perf): We could be hitting this code path quite a lot in complex - // constructors. Is there a better way to do this? - if (S.Current->getFunction()) { - for (const InterpFrame *Frame = S.Current; Frame; Frame = Frame->Caller) { - if (const Function *Func = Frame->getFunction(); - Func && (Func->isConstructor() || Func->isDestructor()) && - Ptr.block() == Frame->getThis().block()) { - return true; - } - } - } - if (!Ptr.isBlockPointer()) return false; + // The This pointer is writable in constructors and destructors, + // even if isConst() returns true. 
+ if (llvm::find(S.InitializingBlocks, Ptr.block())) + return true; + const QualType Ty = Ptr.getType(); const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_modify_const_type) << Ty; @@ -1524,6 +1509,9 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, return false; if (Func->isDestructor() && !CheckDestructor(S, OpPC, ThisPtr)) return false; + + if (Func->isConstructor() || Func->isDestructor()) + S.InitializingBlocks.push_back(ThisPtr.block()); } if (!Func->isFullyCompiled()) @@ -1550,16 +1538,21 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, // Note that we cannot assert(CallResult.hasValue()) here since // Ret() above only sets the APValue if the curent frame doesn't // have a caller set. - if (Interpret(S)) { - NewFrame.release(); // Frame was delete'd already. - assert(S.Current == FrameBefore); - return true; + bool Success = Interpret(S); + // Remove initializing block again. + if (Func->isConstructor() || Func->isDestructor()) + S.InitializingBlocks.pop_back(); + + if (!Success) { + // Interpreting the function failed somehow. Reset to + // previous state. + S.Current = FrameBefore; + return false; } - // Interpreting the function failed somehow. Reset to - // previous state. - S.Current = FrameBefore; - return false; + NewFrame.release(); // Frame was delete'd already. + assert(S.Current == FrameBefore); + return true; } bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func, diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h index 08765561985e2..861e4c38049ab 100644 --- a/clang/lib/AST/ByteCode/InterpState.h +++ b/clang/lib/AST/ByteCode/InterpState.h @@ -190,6 +190,10 @@ class InterpState final : public State, public SourceMapper { std::pair> SeenGlobalTemporaries; + /// List of blocks we're currently running either constructors or destructors + /// for. 
+ llvm::SmallVector InitializingBlocks; + mutable llvm::BumpPtrAllocator Allocator; }; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index e23e84368516a..9092668c67d92 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11537,12 +11537,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { switch (E->getBuiltinCallee()) { case Builtin::BI__builtin_elementwise_add_sat: ResultElements.push_back(APValue( - APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : RHS.uadd_sat(RHS), + APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS), DestEltTy->isUnsignedIntegerOrEnumerationType()))); break; case Builtin::BI__builtin_elementwise_sub_sat: ResultElements.push_back(APValue( - APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : RHS.usub_sat(RHS), + APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS), DestEltTy->isUnsignedIntegerOrEnumerationType()))); break; } diff --git a/clang/lib/Analysis/CMakeLists.txt b/clang/lib/Analysis/CMakeLists.txt index 8cd3990db4c3e..0523d92480cb3 100644 --- a/clang/lib/Analysis/CMakeLists.txt +++ b/clang/lib/Analysis/CMakeLists.txt @@ -21,6 +21,7 @@ add_clang_library(clangAnalysis FixitUtil.cpp IntervalPartition.cpp IssueHash.cpp + LifetimeSafety.cpp LiveVariables.cpp MacroExpansionContext.cpp ObjCNoReturn.cpp diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp new file mode 100644 index 0000000000000..1f18952ce96da --- /dev/null +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -0,0 +1,510 @@ +//===- LifetimeSafety.cpp - C++ Lifetime Safety Analysis -*--------- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "clang/Analysis/Analyses/LifetimeSafety.h" +#include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" +#include "clang/AST/StmtVisitor.h" +#include "clang/AST/Type.h" +#include "clang/Analysis/Analyses/PostOrderCFGView.h" +#include "clang/Analysis/AnalysisDeclContext.h" +#include "clang/Analysis/CFG.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TimeProfiler.h" +#include + +namespace clang { +namespace { + +/// Represents the storage location being borrowed, e.g., a specific stack +/// variable. +/// TODO: Model access paths of other types, e.g., s.field, heap and globals. +struct AccessPath { + const clang::ValueDecl *D; + + AccessPath(const clang::ValueDecl *D) : D(D) {} +}; + +/// A generic, type-safe wrapper for an ID, distinguished by its `Tag` type. +/// Used for giving ID to loans and origins. +template struct ID { + uint32_t Value = 0; + + bool operator==(const ID &Other) const { return Value == Other.Value; } + bool operator!=(const ID &Other) const { return !(*this == Other); } + bool operator<(const ID &Other) const { return Value < Other.Value; } + ID operator++(int) { + ID Tmp = *this; + ++Value; + return Tmp; + } + void Profile(llvm::FoldingSetNodeID &IDBuilder) const { + IDBuilder.AddInteger(Value); + } +}; + +template +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, ID ID) { + return OS << ID.Value; +} + +using LoanID = ID; +using OriginID = ID; + +/// Information about a single borrow, or "Loan". A loan is created when a +/// reference or pointer is created. +struct Loan { + /// TODO: Represent opaque loans. + /// TODO: Represent nullptr: loans to no path. Accessing it UB! 
Currently it + /// is represented as empty LoanSet + LoanID ID; + AccessPath Path; + SourceLocation IssueLoc; + + Loan(LoanID id, AccessPath path, SourceLocation loc) + : ID(id), Path(path), IssueLoc(loc) {} +}; + +/// An Origin is a symbolic identifier that represents the set of possible +/// loans a pointer-like object could hold at any given time. +/// TODO: Enhance the origin model to handle complex types, pointer +/// indirection and reborrowing. The plan is to move from a single origin per +/// variable/expression to a "list of origins" governed by the Type. +/// For example, the type 'int**' would have two origins. +/// See discussion: +/// https://github.com/llvm/llvm-project/pull/142313/commits/0cd187b01e61b200d92ca0b640789c1586075142#r2137644238 +struct Origin { + OriginID ID; + /// A pointer to the AST node that this origin represents. This union + /// distinguishes between origins from declarations (variables or parameters) + /// and origins from expressions. + llvm::PointerUnion Ptr; + + Origin(OriginID ID, const clang::ValueDecl *D) : ID(ID), Ptr(D) {} + Origin(OriginID ID, const clang::Expr *E) : ID(ID), Ptr(E) {} + + const clang::ValueDecl *getDecl() const { + return Ptr.dyn_cast(); + } + const clang::Expr *getExpr() const { + return Ptr.dyn_cast(); + } +}; + +/// Manages the creation, storage and retrieval of loans. +class LoanManager { +public: + LoanManager() = default; + + Loan &addLoan(AccessPath Path, SourceLocation Loc) { + AllLoans.emplace_back(getNextLoanID(), Path, Loc); + return AllLoans.back(); + } + + const Loan &getLoan(LoanID ID) const { + assert(ID.Value < AllLoans.size()); + return AllLoans[ID.Value]; + } + llvm::ArrayRef getLoans() const { return AllLoans; } + +private: + LoanID getNextLoanID() { return NextLoanID++; } + + LoanID NextLoanID{0}; + /// TODO(opt): Profile and evaluate the usefullness of small buffer + /// optimisation. 
+ llvm::SmallVector AllLoans; +}; + +/// Manages the creation, storage, and retrieval of origins for pointer-like +/// variables and expressions. +class OriginManager { +public: + OriginManager() = default; + + Origin &addOrigin(OriginID ID, const clang::ValueDecl &D) { + AllOrigins.emplace_back(ID, &D); + return AllOrigins.back(); + } + Origin &addOrigin(OriginID ID, const clang::Expr &E) { + AllOrigins.emplace_back(ID, &E); + return AllOrigins.back(); + } + + OriginID get(const Expr &E) { + // Origin of DeclRefExpr is that of the declaration it refers to. + if (const auto *DRE = dyn_cast(&E)) + return get(*DRE->getDecl()); + auto It = ExprToOriginID.find(&E); + // TODO: This should be an assert(It != ExprToOriginID.end()). The current + // implementation falls back to getOrCreate to avoid crashing on + // yet-unhandled pointer expressions, creating an empty origin for them. + if (It == ExprToOriginID.end()) + return getOrCreate(E); + + return It->second; + } + + OriginID get(const ValueDecl &D) { + auto It = DeclToOriginID.find(&D); + // TODO: This should be an assert(It != DeclToOriginID.end()). The current + // implementation falls back to getOrCreate to avoid crashing on + // yet-unhandled pointer expressions, creating an empty origin for them. + if (It == DeclToOriginID.end()) + return getOrCreate(D); + + return It->second; + } + + OriginID getOrCreate(const Expr &E) { + auto It = ExprToOriginID.find(&E); + if (It != ExprToOriginID.end()) + return It->second; + + if (const auto *DRE = dyn_cast(&E)) { + // Origin of DeclRefExpr is that of the declaration it refers to. 
+ return getOrCreate(*DRE->getDecl()); + } + OriginID NewID = getNextOriginID(); + addOrigin(NewID, E); + ExprToOriginID[&E] = NewID; + return NewID; + } + + const Origin &getOrigin(OriginID ID) const { + assert(ID.Value < AllOrigins.size()); + return AllOrigins[ID.Value]; + } + + llvm::ArrayRef getOrigins() const { return AllOrigins; } + + OriginID getOrCreate(const ValueDecl &D) { + auto It = DeclToOriginID.find(&D); + if (It != DeclToOriginID.end()) + return It->second; + OriginID NewID = getNextOriginID(); + addOrigin(NewID, D); + DeclToOriginID[&D] = NewID; + return NewID; + } + +private: + OriginID getNextOriginID() { return NextOriginID++; } + + OriginID NextOriginID{0}; + /// TODO(opt): Profile and evaluate the usefullness of small buffer + /// optimisation. + llvm::SmallVector AllOrigins; + llvm::DenseMap DeclToOriginID; + llvm::DenseMap ExprToOriginID; +}; + +/// An abstract base class for a single, atomic lifetime-relevant event. +class Fact { + +public: + enum class Kind : uint8_t { + /// A new loan is issued from a borrow expression (e.g., &x). + Issue, + /// A loan expires as its underlying storage is freed (e.g., variable goes + /// out of scope). + Expire, + /// An origin is propagated from a source to a destination (e.g., p = q). + AssignOrigin, + /// An origin escapes the function by flowing into the return value. 
+ ReturnOfOrigin + }; + +private: + Kind K; + +protected: + Fact(Kind K) : K(K) {} + +public: + virtual ~Fact() = default; + Kind getKind() const { return K; } + + template const T *getAs() const { + if (T::classof(this)) + return static_cast(this); + return nullptr; + } + + virtual void dump(llvm::raw_ostream &OS) const { + OS << "Fact (Kind: " << static_cast(K) << ")\n"; + } +}; + +class IssueFact : public Fact { + LoanID LID; + OriginID OID; + +public: + static bool classof(const Fact *F) { return F->getKind() == Kind::Issue; } + + IssueFact(LoanID LID, OriginID OID) : Fact(Kind::Issue), LID(LID), OID(OID) {} + LoanID getLoanID() const { return LID; } + OriginID getOriginID() const { return OID; } + void dump(llvm::raw_ostream &OS) const override { + OS << "Issue (LoanID: " << getLoanID() << ", OriginID: " << getOriginID() + << ")\n"; + } +}; + +class ExpireFact : public Fact { + LoanID LID; + +public: + static bool classof(const Fact *F) { return F->getKind() == Kind::Expire; } + + ExpireFact(LoanID LID) : Fact(Kind::Expire), LID(LID) {} + LoanID getLoanID() const { return LID; } + void dump(llvm::raw_ostream &OS) const override { + OS << "Expire (LoanID: " << getLoanID() << ")\n"; + } +}; + +class AssignOriginFact : public Fact { + OriginID OIDDest; + OriginID OIDSrc; + +public: + static bool classof(const Fact *F) { + return F->getKind() == Kind::AssignOrigin; + } + + AssignOriginFact(OriginID OIDDest, OriginID OIDSrc) + : Fact(Kind::AssignOrigin), OIDDest(OIDDest), OIDSrc(OIDSrc) {} + OriginID getDestOriginID() const { return OIDDest; } + OriginID getSrcOriginID() const { return OIDSrc; } + void dump(llvm::raw_ostream &OS) const override { + OS << "AssignOrigin (DestID: " << getDestOriginID() + << ", SrcID: " << getSrcOriginID() << ")\n"; + } +}; + +class ReturnOfOriginFact : public Fact { + OriginID OID; + +public: + static bool classof(const Fact *F) { + return F->getKind() == Kind::ReturnOfOrigin; + } + + ReturnOfOriginFact(OriginID OID) : 
Fact(Kind::ReturnOfOrigin), OID(OID) {} + OriginID getReturnedOriginID() const { return OID; } + void dump(llvm::raw_ostream &OS) const override { + OS << "ReturnOfOrigin (OriginID: " << getReturnedOriginID() << ")\n"; + } +}; + +class FactManager { +public: + llvm::ArrayRef getFacts(const CFGBlock *B) const { + auto It = BlockToFactsMap.find(B); + if (It != BlockToFactsMap.end()) + return It->second; + return {}; + } + + void addBlockFacts(const CFGBlock *B, llvm::ArrayRef NewFacts) { + if (!NewFacts.empty()) + BlockToFactsMap[B].assign(NewFacts.begin(), NewFacts.end()); + } + + template + FactType *createFact(Args &&...args) { + void *Mem = FactAllocator.Allocate(); + return new (Mem) FactType(std::forward(args)...); + } + + void dump(const CFG &Cfg, AnalysisDeclContext &AC) const { + llvm::dbgs() << "==========================================\n"; + llvm::dbgs() << " Lifetime Analysis Facts:\n"; + llvm::dbgs() << "==========================================\n"; + if (const Decl *D = AC.getDecl()) + if (const auto *ND = dyn_cast(D)) + llvm::dbgs() << "Function: " << ND->getQualifiedNameAsString() << "\n"; + // Print blocks in the order as they appear in code for a stable ordering. 
+ for (const CFGBlock *B : *AC.getAnalysis()) { + llvm::dbgs() << " Block B" << B->getBlockID() << ":\n"; + auto It = BlockToFactsMap.find(B); + if (It != BlockToFactsMap.end()) { + for (const Fact *F : It->second) { + llvm::dbgs() << " "; + F->dump(llvm::dbgs()); + } + } + llvm::dbgs() << " End of Block\n"; + } + } + + LoanManager &getLoanMgr() { return LoanMgr; } + OriginManager &getOriginMgr() { return OriginMgr; } + +private: + LoanManager LoanMgr; + OriginManager OriginMgr; + llvm::DenseMap> + BlockToFactsMap; + llvm::BumpPtrAllocator FactAllocator; +}; + +class FactGenerator : public ConstStmtVisitor { + +public: + FactGenerator(FactManager &FactMgr, AnalysisDeclContext &AC) + : FactMgr(FactMgr), AC(AC) {} + + void run() { + llvm::TimeTraceScope TimeProfile("FactGenerator"); + // Iterate through the CFG blocks in reverse post-order to ensure that + // initializations and destructions are processed in the correct sequence. + for (const CFGBlock *Block : *AC.getAnalysis()) { + CurrentBlockFacts.clear(); + for (unsigned I = 0; I < Block->size(); ++I) { + const CFGElement &Element = Block->Elements[I]; + if (std::optional CS = Element.getAs()) + Visit(CS->getStmt()); + else if (std::optional DtorOpt = + Element.getAs()) + handleDestructor(*DtorOpt); + } + FactMgr.addBlockFacts(Block, CurrentBlockFacts); + } + } + + void VisitDeclStmt(const DeclStmt *DS) { + for (const Decl *D : DS->decls()) + if (const auto *VD = dyn_cast(D)) + if (hasOrigin(VD->getType())) + if (const Expr *InitExpr = VD->getInit()) + addAssignOriginFact(*VD, *InitExpr); + } + + void VisitCXXNullPtrLiteralExpr(const CXXNullPtrLiteralExpr *N) { + /// TODO: Handle nullptr expr as a special 'null' loan. Uninitialized + /// pointers can use the same type of loan. 
+ FactMgr.getOriginMgr().getOrCreate(*N); + } + + void VisitImplicitCastExpr(const ImplicitCastExpr *ICE) { + if (!hasOrigin(ICE->getType())) + return; + Visit(ICE->getSubExpr()); + // An ImplicitCastExpr node itself gets an origin, which flows from the + // origin of its sub-expression (after stripping its own parens/casts). + // TODO: Consider if this is actually useful in practice. Alternatively, we + // could directly use the sub-expression's OriginID instead of creating a + // new one. + addAssignOriginFact(*ICE, *ICE->getSubExpr()); + } + + void VisitUnaryOperator(const UnaryOperator *UO) { + if (UO->getOpcode() == UO_AddrOf) { + const Expr *SubExpr = UO->getSubExpr(); + if (const auto *DRE = dyn_cast(SubExpr)) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + // Check if it's a local variable. + if (VD->hasLocalStorage()) { + OriginID OID = FactMgr.getOriginMgr().getOrCreate(*UO); + AccessPath AddrOfLocalVarPath(VD); + const Loan &L = FactMgr.getLoanMgr().addLoan(AddrOfLocalVarPath, + UO->getOperatorLoc()); + CurrentBlockFacts.push_back( + FactMgr.createFact(L.ID, OID)); + } + } + } + } + } + + void VisitReturnStmt(const ReturnStmt *RS) { + if (const Expr *RetExpr = RS->getRetValue()) { + if (hasOrigin(RetExpr->getType())) { + OriginID OID = FactMgr.getOriginMgr().getOrCreate(*RetExpr); + CurrentBlockFacts.push_back( + FactMgr.createFact(OID)); + } + } + } + + void VisitBinaryOperator(const BinaryOperator *BO) { + if (BO->isAssignmentOp()) { + const Expr *LHSExpr = BO->getLHS(); + const Expr *RHSExpr = BO->getRHS(); + + // We are interested in assignments like `ptr1 = ptr2` or `ptr = &var` + // LHS must be a pointer/reference type that can be an origin. + // RHS must also represent an origin (either another pointer/ref or an + // address-of). 
+ if (const auto *DRE_LHS = dyn_cast(LHSExpr)) + if (const auto *VD_LHS = + dyn_cast(DRE_LHS->getDecl()->getCanonicalDecl()); + VD_LHS && hasOrigin(VD_LHS->getType())) + addAssignOriginFact(*VD_LHS, *RHSExpr); + } + } + +private: + // Check if a type has an origin. + bool hasOrigin(QualType QT) { return QT->isPointerOrReferenceType(); } + + template + void addAssignOriginFact(const Destination &D, const Source &S) { + OriginID DestOID = FactMgr.getOriginMgr().getOrCreate(D); + OriginID SrcOID = FactMgr.getOriginMgr().get(S); + CurrentBlockFacts.push_back( + FactMgr.createFact(DestOID, SrcOID)); + } + + void handleDestructor(const CFGAutomaticObjDtor &DtorOpt) { + /// TODO: Also handle trivial destructors (e.g., for `int` + /// variables) which will never have a CFGAutomaticObjDtor node. + /// TODO: Handle loans to temporaries. + /// TODO: Consider using clang::CFG::BuildOptions::AddLifetime to reuse the + /// lifetime ends. + const VarDecl *DestructedVD = DtorOpt.getVarDecl(); + if (!DestructedVD) + return; + // Iterate through all loans to see if any expire. + /// TODO(opt): Do better than a linear search to find loans associated with + /// 'DestructedVD'. + for (const Loan &L : FactMgr.getLoanMgr().getLoans()) { + const AccessPath &LoanPath = L.Path; + // Check if the loan is for a stack variable and if that variable + // is the one being destructed. + if (LoanPath.D == DestructedVD) + CurrentBlockFacts.push_back(FactMgr.createFact(L.ID)); + } + } + + FactManager &FactMgr; + AnalysisDeclContext &AC; + llvm::SmallVector CurrentBlockFacts; +}; + +// ========================================================================= // +// TODO: Run dataflow analysis to propagate loans, analyse and error reporting. 
+// ========================================================================= // +} // anonymous namespace + +void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, + AnalysisDeclContext &AC) { + llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); + DEBUG_WITH_TYPE("PrintCFG", Cfg.dump(AC.getASTContext().getLangOpts(), + /*ShowColors=*/true)); + FactManager FactMgr; + FactGenerator FactGen(FactMgr, AC); + FactGen.run(); + DEBUG_WITH_TYPE("LifetimeFacts", FactMgr.dump(Cfg, AC)); +} +} // namespace clang diff --git a/clang/lib/Analysis/UninitializedValues.cpp b/clang/lib/Analysis/UninitializedValues.cpp index b2a68b6c39a7e..8c9cf8dac79ed 100644 --- a/clang/lib/Analysis/UninitializedValues.cpp +++ b/clang/lib/Analysis/UninitializedValues.cpp @@ -161,8 +161,7 @@ class CFGBlockValues { ValueVector::reference operator[](const VarDecl *vd); - Value getValue(const CFGBlock *block, const CFGBlock *dstBlock, - const VarDecl *vd) { + Value getValue(const CFGBlock *block, const VarDecl *vd) { std::optional idx = declToIndex.getValueIndex(vd); return getValueVector(block)[*idx]; } @@ -589,12 +588,12 @@ class TransferFunctions : public StmtVisitor { if (!Pred) continue; - Value AtPredExit = vals.getValue(Pred, B, vd); + Value AtPredExit = vals.getValue(Pred, vd); if (AtPredExit == Initialized) // This block initializes the variable. continue; if (AtPredExit == MayUninitialized && - vals.getValue(B, nullptr, vd) == Uninitialized) { + vals.getValue(B, vd) == Uninitialized) { // This block declares the variable (uninitialized), and is reachable // from a block that initializes the variable. We can't guarantee to // give an earlier location for the diagnostic (and it appears that @@ -625,6 +624,8 @@ class TransferFunctions : public StmtVisitor { // Scan the frontier, looking for blocks where the variable was // uninitialized. 
for (const auto *Block : cfg) { + if (vals.getValue(Block, vd) != Uninitialized) + continue; unsigned BlockID = Block->getBlockID(); const Stmt *Term = Block->getTerminatorStmt(); if (SuccsVisited[BlockID] && SuccsVisited[BlockID] < Block->succ_size() && @@ -635,8 +636,7 @@ class TransferFunctions : public StmtVisitor { for (CFGBlock::const_succ_iterator I = Block->succ_begin(), E = Block->succ_end(); I != E; ++I) { const CFGBlock *Succ = *I; - if (Succ && SuccsVisited[Succ->getBlockID()] >= Succ->succ_size() && - vals.getValue(Block, Succ, vd) == Uninitialized) { + if (Succ && SuccsVisited[Succ->getBlockID()] >= Succ->succ_size()) { // Switch cases are a special case: report the label to the caller // as the 'terminator', not the switch statement itself. Suppress // situations where no label matched: we can't be sure that's @@ -675,8 +675,11 @@ void TransferFunctions::reportUse(const Expr *ex, const VarDecl *vd) { void TransferFunctions::reportConstRefUse(const Expr *ex, const VarDecl *vd) { Value v = vals[vd]; - if (isAlwaysUninit(v)) - handler.handleConstRefUseOfUninitVariable(vd, getUninitUse(ex, vd, v)); + if (isAlwaysUninit(v)) { + auto use = getUninitUse(ex, vd, v); + use.setConstRefUse(); + handler.handleUseOfUninitVariable(vd, use); + } } void TransferFunctions::VisitObjCForCollectionStmt(ObjCForCollectionStmt *FS) { @@ -891,12 +894,6 @@ struct PruneBlocksHandler : public UninitVariablesHandler { hadAnyUse = true; } - void handleConstRefUseOfUninitVariable(const VarDecl *vd, - const UninitUse &use) override { - hadUse[currentBlock] = true; - hadAnyUse = true; - } - /// Called when the uninitialized variable analysis detects the /// idiom 'int x = x'. All other uses of 'x' within the initializer /// are handled by handleUseOfUninitVariable. 
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 0b33f6c7d03b7..d0b6b6918f0e2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -364,6 +364,20 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return create(loc, operandTy.getElementType(), operand); } + /// Create a cir.complex.real_ptr operation that derives a pointer to the real + /// part of the complex value pointed to by the specified pointer value. + mlir::Value createComplexRealPtr(mlir::Location loc, mlir::Value value) { + auto srcPtrTy = mlir::cast(value.getType()); + auto srcComplexTy = mlir::cast(srcPtrTy.getPointee()); + return create( + loc, getPointerTo(srcComplexTy.getElementType()), value); + } + + Address createComplexRealPtr(mlir::Location loc, Address addr) { + return Address{createComplexRealPtr(loc, addr.getPointer()), + addr.getAlignment()}; + } + /// Create a cir.ptr_stride operation to get access to an array element. /// \p idx is the index of the element to access, \p shouldDecay is true if /// the result should decay to a pointer to the element type. 
@@ -394,6 +408,15 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return createGlobal(module, loc, uniqueName, type, linkage); } + mlir::Value createSetBitfield(mlir::Location loc, mlir::Type resultType, + mlir::Value dstAddr, mlir::Type storageType, + mlir::Value src, const CIRGenBitFieldInfo &info, + bool isLvalueVolatile, bool useVolatile) { + return create(loc, resultType, dstAddr, storageType, + src, info.name, info.size, info.offset, + info.isSigned, isLvalueVolatile); + } + mlir::Value createGetBitfield(mlir::Location loc, mlir::Type resultType, mlir::Value addr, mlir::Type storageType, const CIRGenBitFieldInfo &info, diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index da8166a596d42..cc4a615dc392e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -117,6 +117,75 @@ static void emitMemberInitializer(CIRGenFunction &cgf, cgf.emitInitializerForField(field, lhs, memberInit->getInit()); } +static bool isInitializerOfDynamicClass(const CXXCtorInitializer *baseInit) { + const Type *baseType = baseInit->getBaseClass(); + const auto *baseClassDecl = + cast(baseType->castAs()->getDecl()); + return baseClassDecl->isDynamicClass(); +} + +/// Gets the address of a direct base class within a complete object. +/// This should only be used for (1) non-virtual bases or (2) virtual bases +/// when the type is known to be complete (e.g. in complete destructors). +/// +/// The object pointed to by 'thisAddr' is assumed to be non-null. +Address CIRGenFunction::getAddressOfDirectBaseInCompleteClass( + mlir::Location loc, Address thisAddr, const CXXRecordDecl *derived, + const CXXRecordDecl *base, bool baseIsVirtual) { + // 'thisAddr' must be a pointer (in some address space) to Derived. + assert(thisAddr.getElementType() == convertType(derived)); + + // Compute the offset of the virtual base. 
+ CharUnits offset; + const ASTRecordLayout &layout = getContext().getASTRecordLayout(derived); + if (baseIsVirtual) + offset = layout.getVBaseClassOffset(base); + else + offset = layout.getBaseClassOffset(base); + + return builder.createBaseClassAddr(loc, thisAddr, convertType(base), + offset.getQuantity(), + /*assumeNotNull=*/true); +} + +void CIRGenFunction::emitBaseInitializer(mlir::Location loc, + const CXXRecordDecl *classDecl, + CXXCtorInitializer *baseInit) { + assert(curFuncDecl && "loading 'this' without a func declaration?"); + assert(isa(curFuncDecl)); + + assert(baseInit->isBaseInitializer() && "Must have base initializer!"); + + Address thisPtr = loadCXXThisAddress(); + + const Type *baseType = baseInit->getBaseClass(); + const auto *baseClassDecl = + cast(baseType->castAs()->getDecl()); + + bool isBaseVirtual = baseInit->isBaseVirtual(); + + // If the initializer for the base (other than the constructor + // itself) accesses 'this' in any way, we need to initialize the + // vtables. + if (classDecl->isDynamicClass()) { + cgm.errorNYI(loc, "emitBaseInitializer: dynamic class"); + return; + } + + // We can pretend to be a complete class because it only matters for + // virtual bases, and we only do virtual bases for complete ctors. + Address v = getAddressOfDirectBaseInCompleteClass( + loc, thisPtr, classDecl, baseClassDecl, isBaseVirtual); + assert(!cir::MissingFeatures::aggValueSlotGC()); + AggValueSlot aggSlot = AggValueSlot::forAddr( + v, Qualifiers(), AggValueSlot::IsDestructed, AggValueSlot::IsNotAliased, + getOverlapForBaseInit(classDecl, baseClassDecl, isBaseVirtual)); + + emitAggExpr(baseInit->getInit(), aggSlot); + + assert(!cir::MissingFeatures::requiresCleanups()); +} + /// This routine generates necessary code to initialize base classes and /// non-static data members belonging to this constructor. 
void CIRGenFunction::emitCtorPrologue(const CXXConstructorDecl *cd, @@ -154,12 +223,29 @@ void CIRGenFunction::emitCtorPrologue(const CXXConstructorDecl *cd, return; } - if ((*b)->isBaseInitializer()) { + const mlir::Value oldThisValue = cxxThisValue; + if (!constructVBases && (*b)->isBaseInitializer() && (*b)->isBaseVirtual()) { cgm.errorNYI(cd->getSourceRange(), - "emitCtorPrologue: non-virtual base initializer"); + "emitCtorPrologue: virtual base initializer"); return; } + // Handle non-virtual base initializers. + for (; b != e && (*b)->isBaseInitializer(); b++) { + assert(!(*b)->isBaseVirtual()); + + if (cgm.getCodeGenOpts().StrictVTablePointers && + cgm.getCodeGenOpts().OptimizationLevel > 0 && + isInitializerOfDynamicClass(*b)) { + cgm.errorNYI(cd->getSourceRange(), + "emitCtorPrologue: strict vtable pointers"); + return; + } + emitBaseInitializer(getLoc(cd->getBeginLoc()), classDecl, *b); + } + + cxxThisValue = oldThisValue; + if (classDecl->isDynamicClass()) { cgm.errorNYI(cd->getSourceRange(), "emitCtorPrologue: initialize vtable pointers"); diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 300ba7a456e4b..b1d6b8047a0ab 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -224,6 +224,10 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst, return; } + assert(dst.isBitField() && "Unknown LValue type"); + emitStoreThroughBitfieldLValue(src, dst); + return; + cgm.errorNYI(dst.getPointer().getLoc(), "emitStoreThroughLValue: non-simple lvalue"); return; @@ -321,9 +325,21 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, mlir::Value CIRGenFunction::emitStoreThroughBitfieldLValue(RValue src, LValue dst) { - assert(!cir::MissingFeatures::bitfields()); - cgm.errorNYI("bitfields"); - return {}; + + assert(!cir::MissingFeatures::armComputeVolatileBitfields()); + + const CIRGenBitFieldInfo &info = dst.getBitFieldInfo(); + mlir::Type resLTy 
= convertTypeForMem(dst.getType()); + Address ptr = dst.getBitFieldAddress(); + + assert(!cir::MissingFeatures::armComputeVolatileBitfields()); + const bool useVolatile = false; + + mlir::Value dstAddr = dst.getAddress().getPointer(); + + return builder.createSetBitfield(dstAddr.getLoc(), resLTy, dstAddr, + ptr.getElementType(), src.getValue(), info, + dst.isVolatileQualified(), useVolatile); } RValue CIRGenFunction::emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc) { @@ -621,8 +637,30 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) { } case UO_Real: case UO_Imag: { - cgm.errorNYI(e->getSourceRange(), "UnaryOp real/imag"); - return LValue(); + if (op == UO_Imag) { + cgm.errorNYI(e->getSourceRange(), "UnaryOp real/imag"); + return LValue(); + } + + LValue lv = emitLValue(e->getSubExpr()); + assert(lv.isSimple() && "real/imag on non-ordinary l-value"); + + // __real is valid on scalars. This is a faster way of testing that. + // __imag can only produce an rvalue on scalars. 
+ if (e->getOpcode() == UO_Real && + !mlir::isa(lv.getAddress().getElementType())) { + assert(e->getSubExpr()->getType()->isArithmeticType()); + return lv; + } + + QualType exprTy = getContext().getCanonicalType(e->getSubExpr()->getType()); + QualType elemTy = exprTy->castAs()->getElementType(); + mlir::Location loc = getLoc(e->getExprLoc()); + Address component = builder.createComplexRealPtr(loc, lv.getAddress()); + assert(!cir::MissingFeatures::opTBAA()); + LValue elemLV = makeAddrLValue(component, elemTy); + elemLV.getQuals().addQualifiers(lv.getQuals()); + return elemLV; } case UO_PreInc: case UO_PreDec: { @@ -1062,11 +1100,10 @@ LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) { LValue lv = emitLValue(e->getLHS()); SourceLocRAIIObject loc{*this, getLoc(e->getSourceRange())}; - if (lv.isBitField()) { - cgm.errorNYI(e->getSourceRange(), "bitfields"); - return {}; - } - emitStoreThroughLValue(rv, lv); + if (lv.isBitField()) + emitStoreThroughBitfieldLValue(rv, lv); + else + emitStoreThroughLValue(rv, lv); if (getLangOpts().OpenMP) { cgm.errorNYI(e->getSourceRange(), "openmp"); @@ -1578,10 +1615,15 @@ void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e, delegating = true; break; case CXXConstructionKind::VirtualBase: - case CXXConstructionKind::NonVirtualBase: + // This should just set 'forVirtualBase' to true and fall through, but + // virtual base class support is otherwise missing, so this needs to wait + // until it can be tested. 
cgm.errorNYI(e->getSourceRange(), - "emitCXXConstructExpr: other construction kind"); + "emitCXXConstructExpr: virtual base constructor"); return; + case CXXConstructionKind::NonVirtualBase: + type = Ctor_Base; + break; } emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index ffe1b701b244e..0d12c5c3edded 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -16,6 +16,7 @@ #include "clang/CIR/Dialect/IR/CIRAttrs.h" #include "clang/AST/Expr.h" +#include "clang/AST/RecordLayout.h" #include "clang/AST/StmtVisitor.h" #include @@ -362,6 +363,28 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr( "visitCXXParenListOrInitListExpr Record or VariableSizeArray type"); } +// TODO(cir): This could be shared with classic codegen. +AggValueSlot::Overlap_t CIRGenFunction::getOverlapForBaseInit( + const CXXRecordDecl *rd, const CXXRecordDecl *baseRD, bool isVirtual) { + // If the most-derived object is a field declared with [[no_unique_address]], + // the tail padding of any virtual base could be reused for other subobjects + // of that field's class. + if (isVirtual) + return AggValueSlot::MayOverlap; + + // If the base class is laid out entirely within the nvsize of the derived + // class, its tail padding cannot yet be initialized, so we can issue + // stores at the full width of the base class. + const ASTRecordLayout &layout = getContext().getASTRecordLayout(rd); + if (layout.getBaseClassOffset(baseRD) + + getContext().getASTRecordLayout(baseRD).getSize() <= + layout.getNonVirtualSize()) + return AggValueSlot::DoesNotOverlap; + + // The tail padding may contain values we need to preserve. 
+ return AggValueSlot::MayOverlap; +} + void CIRGenFunction::emitAggExpr(const Expr *e, AggValueSlot slot) { AggExprEmitter(*this, slot).Visit(const_cast(e)); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 84fad959ebf49..cb83691b4452d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -57,6 +57,55 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *e); mlir::Value VisitUnaryDeref(const Expr *e); + + struct BinOpInfo { + mlir::Location loc; + mlir::Value lhs{}; + mlir::Value rhs{}; + QualType ty{}; // Computation Type. + FPOptions fpFeatures{}; + }; + + BinOpInfo emitBinOps(const BinaryOperator *e, + QualType promotionTy = QualType()); + + mlir::Value emitPromoted(const Expr *e, QualType promotionTy); + + mlir::Value emitPromotedComplexOperand(const Expr *e, QualType promotionTy); + + mlir::Value emitBinAdd(const BinOpInfo &op); + + QualType getPromotionType(QualType ty, bool isDivOpCode = false) { + if (auto *complexTy = ty->getAs()) { + QualType elementTy = complexTy->getElementType(); + if (isDivOpCode && elementTy->isFloatingType() && + cgf.getLangOpts().getComplexRange() == + LangOptions::ComplexRangeKind::CX_Promoted) { + cgf.cgm.errorNYI("HigherPrecisionTypeForComplexArithmetic"); + return QualType(); + } + + if (elementTy.UseExcessPrecision(cgf.getContext())) + return cgf.getContext().getComplexType(cgf.getContext().FloatTy); + } + + if (ty.UseExcessPrecision(cgf.getContext())) + return cgf.getContext().FloatTy; + return QualType(); + } + +#define HANDLEBINOP(OP) \ + mlir::Value VisitBin##OP(const BinaryOperator *e) { \ + QualType promotionTy = getPromotionType( \ + e->getType(), e->getOpcode() == BinaryOperatorKind::BO_Div); \ + mlir::Value result = emitBin##OP(emitBinOps(e, promotionTy)); \ + if (!promotionTy.isNull()) \ + cgf.cgm.errorNYI("Binop 
emitUnPromotedValue"); \ + return result; \ + } + + HANDLEBINOP(Add) +#undef HANDLEBINOP }; } // namespace @@ -242,12 +291,8 @@ mlir::Value ComplexExprEmitter::VisitInitListExpr(const InitListExpr *e) { } assert(e->getNumInits() == 0 && "Unexpected number of inits"); - QualType complexElemTy = - e->getType()->castAs()->getElementType(); - mlir::Type complexElemLLVMTy = cgf.convertType(complexElemTy); - mlir::TypedAttr defaultValue = builder.getZeroInitAttr(complexElemLLVMTy); - auto complexAttr = cir::ConstComplexAttr::get(defaultValue, defaultValue); - return builder.create(loc, complexAttr); + mlir::Type complexTy = cgf.convertType(e->getType()); + return builder.getNullValue(complexTy, loc); } mlir::Value @@ -291,6 +336,60 @@ mlir::Value ComplexExprEmitter::VisitUnaryDeref(const Expr *e) { return emitLoadOfLValue(e); } +mlir::Value ComplexExprEmitter::emitPromoted(const Expr *e, + QualType promotionTy) { + e = e->IgnoreParens(); + if (const auto *bo = dyn_cast(e)) { + switch (bo->getOpcode()) { +#define HANDLE_BINOP(OP) \ + case BO_##OP: \ + return emitBin##OP(emitBinOps(bo, promotionTy)); + HANDLE_BINOP(Add) +#undef HANDLE_BINOP + default: + break; + } + } else if (isa(e)) { + cgf.cgm.errorNYI("emitPromoted UnaryOperator"); + return {}; + } + + mlir::Value result = Visit(const_cast(e)); + if (!promotionTy.isNull()) + cgf.cgm.errorNYI("emitPromoted emitPromotedValue"); + + return result; +} + +mlir::Value +ComplexExprEmitter::emitPromotedComplexOperand(const Expr *e, + QualType promotionTy) { + if (e->getType()->isAnyComplexType()) { + if (!promotionTy.isNull()) + return cgf.emitPromotedComplexExpr(e, promotionTy); + return Visit(const_cast(e)); + } + + cgf.cgm.errorNYI("emitPromotedComplexOperand non-complex type"); + return {}; +} + +ComplexExprEmitter::BinOpInfo +ComplexExprEmitter::emitBinOps(const BinaryOperator *e, QualType promotionTy) { + BinOpInfo binOpInfo{cgf.getLoc(e->getExprLoc())}; + binOpInfo.lhs = emitPromotedComplexOperand(e->getLHS(), 
promotionTy); + binOpInfo.rhs = emitPromotedComplexOperand(e->getRHS(), promotionTy); + binOpInfo.ty = promotionTy.isNull() ? e->getType() : promotionTy; + binOpInfo.fpFeatures = e->getFPFeaturesInEffect(cgf.getLangOpts()); + return binOpInfo; +} + +mlir::Value ComplexExprEmitter::emitBinAdd(const BinOpInfo &op) { + assert(!cir::MissingFeatures::fastMathFlags()); + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + return builder.create(op.loc, op.lhs, op.rhs); +} + LValue CIRGenFunction::emitComplexAssignmentLValue(const BinaryOperator *e) { assert(e->getOpcode() == BO_Assign && "Expected assign op"); @@ -313,3 +412,8 @@ void CIRGenFunction::emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest, bool isInit) { ComplexExprEmitter(*this).emitStoreOfComplex(loc, v, dest, isInit); } + +mlir::Value CIRGenFunction::emitPromotedComplexExpr(const Expr *e, + QualType promotionType) { + return ComplexExprEmitter(*this).emitPromoted(e, promotionType); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 76353bae68e21..5feb5fc94d983 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -562,6 +562,19 @@ class CIRGenFunction : public CIRGenTypeCache { } Address loadCXXThisAddress(); + /// Convert the given pointer to a complete class to the given direct base. + Address getAddressOfDirectBaseInCompleteClass(mlir::Location loc, + Address value, + const CXXRecordDecl *derived, + const CXXRecordDecl *base, + bool baseIsVirtual); + + /// Determine whether a base class initialization may overlap some other + /// object. + AggValueSlot::Overlap_t getOverlapForBaseInit(const CXXRecordDecl *rd, + const CXXRecordDecl *baseRD, + bool isVirtual); + /// Get an appropriate 'undef' rvalue for the given type. /// TODO: What's the equivalent for MLIR? 
Currently we're only using this for /// void types so it just returns RValue::get(nullptr) but it'll need @@ -762,6 +775,9 @@ class CIRGenFunction : public CIRGenTypeCache { void emitAutoVarCleanups(const AutoVarEmission &emission); void emitAutoVarInit(const AutoVarEmission &emission); + void emitBaseInitializer(mlir::Location loc, const CXXRecordDecl *classDecl, + CXXCtorInitializer *baseInit); + LValue emitBinaryOperatorLValue(const BinaryOperator *e); mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s); @@ -886,6 +902,8 @@ class CIRGenFunction : public CIRGenTypeCache { void emitInitializerForField(clang::FieldDecl *field, LValue lhs, clang::Expr *init); + mlir::Value emitPromotedComplexExpr(const Expr *e, QualType promotionType); + mlir::Value emitPromotedScalarExpr(const Expr *e, QualType promotionType); /// Emit the computation of the specified expression of scalar type. diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index c1434ee697f4c..8b2883b50d2e2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1258,6 +1258,8 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) { case Decl::Enum: case Decl::Using: // using X; [C++] case Decl::UsingDirective: // using namespace X; [C++] + case Decl::UsingEnum: // using enum X; [C++] + case Decl::NamespaceAlias: case Decl::Typedef: case Decl::TypeAlias: // using foo = bar; [C++11] case Decl::Record: diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index e1b0f805a7b21..0a6dba5e80a62 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -186,6 +186,8 @@ class LValue { bool isBitField() const { return lvType == BitField; } bool isVolatile() const { return quals.hasVolatile(); } + bool isVolatileQualified() const { return quals.hasVolatile(); } + unsigned getVRQualifiers() const { return quals.getCVRQualifiers() & ~clang::Qualifiers::Const; } 
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 8512b229c2663..5fe5ac827fff9 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -2066,6 +2066,10 @@ LogicalResult cir::ComplexRealOp::verify() { } OpFoldResult cir::ComplexRealOp::fold(FoldAdaptor adaptor) { + if (auto complexCreateOp = + dyn_cast_or_null(getOperand().getDefiningOp())) + return complexCreateOp.getOperand(0); + auto complex = mlir::cast_if_present(adaptor.getOperand()); return complex ? complex.getReal() : nullptr; @@ -2084,11 +2088,33 @@ LogicalResult cir::ComplexImagOp::verify() { } OpFoldResult cir::ComplexImagOp::fold(FoldAdaptor adaptor) { + if (auto complexCreateOp = + dyn_cast_or_null(getOperand().getDefiningOp())) + return complexCreateOp.getOperand(1); + auto complex = mlir::cast_if_present(adaptor.getOperand()); return complex ? complex.getImag() : nullptr; } +//===----------------------------------------------------------------------===// +// ComplexRealPtrOp +//===----------------------------------------------------------------------===// + +LogicalResult cir::ComplexRealPtrOp::verify() { + mlir::Type resultPointeeTy = getType().getPointee(); + cir::PointerType operandPtrTy = getOperand().getType(); + auto operandPointeeTy = + mlir::cast(operandPtrTy.getPointee()); + + if (resultPointeeTy != operandPointeeTy.getElementType()) { + emitOpError() << ": result type does not match operand type"; + return failure(); + } + + return success(); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index af307f6ad673d..3446265769a2c 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ 
b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -2048,9 +2048,11 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMBrOpLowering, CIRToLLVMCallOpLowering, CIRToLLVMCmpOpLowering, + CIRToLLVMComplexAddOpLowering, CIRToLLVMComplexCreateOpLowering, CIRToLLVMComplexImagOpLowering, CIRToLLVMComplexRealOpLowering, + CIRToLLVMComplexRealPtrOpLowering, CIRToLLVMConstantOpLowering, CIRToLLVMExpectOpLowering, CIRToLLVMFuncOpLowering, @@ -2058,6 +2060,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMGetGlobalOpLowering, CIRToLLVMGetMemberOpLowering, CIRToLLVMSelectOpLowering, + CIRToLLVMSetBitfieldOpLowering, CIRToLLVMShiftOpLowering, CIRToLLVMStackRestoreOpLowering, CIRToLLVMStackSaveOpLowering, @@ -2357,6 +2360,55 @@ mlir::LogicalResult CIRToLLVMVecTernaryOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMComplexAddOpLowering::matchAndRewrite( + cir::ComplexAddOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Value lhs = adaptor.getLhs(); + mlir::Value rhs = adaptor.getRhs(); + mlir::Location loc = op.getLoc(); + + auto complexType = mlir::cast(op.getLhs().getType()); + mlir::Type complexElemTy = + getTypeConverter()->convertType(complexType.getElementType()); + auto lhsReal = + rewriter.create(loc, complexElemTy, lhs, 0); + auto lhsImag = + rewriter.create(loc, complexElemTy, lhs, 1); + auto rhsReal = + rewriter.create(loc, complexElemTy, rhs, 0); + auto rhsImag = + rewriter.create(loc, complexElemTy, rhs, 1); + + mlir::Value newReal; + mlir::Value newImag; + if (complexElemTy.isInteger()) { + newReal = rewriter.create(loc, complexElemTy, lhsReal, + rhsReal); + newImag = rewriter.create(loc, complexElemTy, lhsImag, + rhsImag); + } else { + assert(!cir::MissingFeatures::fastMathFlags()); + assert(!cir::MissingFeatures::fpConstraints()); + newReal = rewriter.create(loc, complexElemTy, lhsReal, + rhsReal); + newImag = rewriter.create(loc, complexElemTy, lhsImag, + rhsImag); + } + 
+ mlir::Type complexLLVMTy = + getTypeConverter()->convertType(op.getResult().getType()); + auto initialComplex = + rewriter.create(op->getLoc(), complexLLVMTy); + + auto realComplex = rewriter.create( + op->getLoc(), initialComplex, newReal, 0); + + rewriter.replaceOpWithNewOp(op, realComplex, + newImag, 1); + + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMComplexCreateOpLowering::matchAndRewrite( cir::ComplexCreateOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { @@ -2393,6 +2445,105 @@ mlir::LogicalResult CIRToLLVMComplexImagOpLowering::matchAndRewrite( return mlir::success(); } +mlir::IntegerType computeBitfieldIntType(mlir::Type storageType, + mlir::MLIRContext *context, + unsigned &storageSize) { + return TypeSwitch(storageType) + .Case([&](cir::ArrayType atTy) { + storageSize = atTy.getSize() * 8; + return mlir::IntegerType::get(context, storageSize); + }) + .Case([&](cir::IntType intTy) { + storageSize = intTy.getWidth(); + return mlir::IntegerType::get(context, storageSize); + }) + .Default([](mlir::Type) -> mlir::IntegerType { + llvm_unreachable( + "Either ArrayType or IntType expected for bitfields storage"); + }); +} + +mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( + cir::SetBitfieldOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(op); + + cir::BitfieldInfoAttr info = op.getBitfieldInfo(); + uint64_t size = info.getSize(); + uint64_t offset = info.getOffset(); + mlir::Type storageType = info.getStorageType(); + mlir::MLIRContext *context = storageType.getContext(); + + unsigned storageSize = 0; + + mlir::IntegerType intType = + computeBitfieldIntType(storageType, context, storageSize); + + mlir::Value srcVal = createIntCast(rewriter, adaptor.getSrc(), intType); + unsigned srcWidth = storageSize; + mlir::Value resultVal = srcVal; + + if (storageSize != size) { + assert(storageSize > 
size && "Invalid bitfield size."); + + mlir::Value val = rewriter.create( + op.getLoc(), intType, adaptor.getAddr(), /* alignment */ 0, + op.getIsVolatile()); + + srcVal = + createAnd(rewriter, srcVal, llvm::APInt::getLowBitsSet(srcWidth, size)); + resultVal = srcVal; + srcVal = createShL(rewriter, srcVal, offset); + + // Mask out the original value. + val = createAnd(rewriter, val, + ~llvm::APInt::getBitsSet(srcWidth, offset, offset + size)); + + // Or together the unchanged values and the source value. + srcVal = rewriter.create(op.getLoc(), val, srcVal); + } + + rewriter.create(op.getLoc(), srcVal, adaptor.getAddr(), + /* alignment */ 0, op.getIsVolatile()); + + mlir::Type resultTy = getTypeConverter()->convertType(op.getType()); + + if (info.getIsSigned()) { + assert(size <= storageSize); + unsigned highBits = storageSize - size; + + if (highBits) { + resultVal = createShL(rewriter, resultVal, highBits); + resultVal = createAShR(rewriter, resultVal, highBits); + } + } + + resultVal = createIntCast(rewriter, resultVal, + mlir::cast(resultTy), + info.getIsSigned()); + + rewriter.replaceOp(op, resultVal); + return mlir::success(); +} + +mlir::LogicalResult CIRToLLVMComplexRealPtrOpLowering::matchAndRewrite( + cir::ComplexRealPtrOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + cir::PointerType operandTy = op.getOperand().getType(); + mlir::Type resultLLVMTy = getTypeConverter()->convertType(op.getType()); + mlir::Type elementLLVMTy = + getTypeConverter()->convertType(operandTy.getPointee()); + + mlir::LLVM::GEPArg gepIndices[2] = {0, 0}; + mlir::LLVM::GEPNoWrapFlags inboundsNuw = + mlir::LLVM::GEPNoWrapFlags::inbounds | mlir::LLVM::GEPNoWrapFlags::nuw; + rewriter.replaceOpWithNewOp( + op, resultLLVMTy, elementLLVMTy, adaptor.getOperand(), gepIndices, + inboundsNuw); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMGetBitfieldOpLowering::matchAndRewrite( cir::GetBitfieldOp op, OpAdaptor adaptor, 
mlir::ConversionPatternRewriter &rewriter) const { @@ -2408,19 +2559,7 @@ mlir::LogicalResult CIRToLLVMGetBitfieldOpLowering::matchAndRewrite( unsigned storageSize = 0; mlir::IntegerType intType = - TypeSwitch(storageType) - .Case([&](cir::ArrayType atTy) { - storageSize = atTy.getSize() * 8; - return mlir::IntegerType::get(context, storageSize); - }) - .Case([&](cir::IntType intTy) { - storageSize = intTy.getWidth(); - return mlir::IntegerType::get(context, storageSize); - }) - .Default([](mlir::Type) -> mlir::IntegerType { - llvm_unreachable( - "Either ArrayType or IntType expected for bitfields storage"); - }); + computeBitfieldIntType(storageType, context, storageSize); mlir::Value val = rewriter.create( op.getLoc(), intType, adaptor.getAddr(), 0, op.getIsVolatile()); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index d9fb91066317b..ed158eb7289dd 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -513,6 +513,36 @@ class CIRToLLVMComplexImagOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMComplexRealPtrOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::ComplexRealPtrOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + +class CIRToLLVMComplexAddOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::ComplexAddOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + +class CIRToLLVMSetBitfieldOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::SetBitfieldOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + 
class CIRToLLVMGetBitfieldOpLowering : public mlir::OpConversionPattern { public: diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 48c91eb4a5b4f..5f2eb76e7bacb 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5987,8 +5987,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, auto *Zero = llvm::ConstantInt::get(IntTy, 0); for (unsigned I = First; I < NumArgs; ++I) { auto *Index = llvm::ConstantInt::get(IntTy, I - First); - auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr, - {Zero, Index}); + auto *GEP = + Builder.CreateGEP(Tmp.getElementType(), Alloca, {Zero, Index}); if (I == First) ElemPtr = GEP; auto *V = diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b76163afc8aa4..fe1865888bdd0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7047,6 +7047,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fapinotes-modules"); Args.AddLastArg(CmdArgs, options::OPT_fapinotes_swift_version); + if (Args.hasFlag(options::OPT_fswift_version_independent_apinotes, + options::OPT_fno_swift_version_independent_apinotes, false)) + CmdArgs.push_back("-fswift-version-independent-apinotes"); + // -fblocks=0 is default. if (Args.hasFlag(options::OPT_fblocks, options::OPT_fno_blocks, TC.IsBlocksDefault()) || @@ -9152,7 +9156,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // specific architecture via -Xarch_ will not be forwarded. 
ArgStringList CompilerArgs; ArgStringList LinkerArgs; - for (Arg *A : C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind)) { + const DerivedArgList &ToolChainArgs = + C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind); + for (Arg *A : ToolChainArgs) { if (A->getOption().matches(OPT_Zlinker_input)) LinkerArgs.emplace_back(A->getValue()); else if (ShouldForward(CompilerOptions, A)) @@ -9161,6 +9167,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, A->render(Args, LinkerArgs); } + // If the user explicitly requested it via `--offload-arch` we should + // extract it from any static libraries if present. + for (StringRef Arg : ToolChainArgs.getAllArgValues(OPT_offload_arch_EQ)) + CmdArgs.emplace_back(Args.MakeArgString("--should-extract=" + Arg)); + // If this is OpenMP the device linker will need `-lompdevice`. if (Kind == Action::OFK_OpenMP && !Args.hasArg(OPT_no_offloadlib) && (TC->getTriple().isAMDGPU() || TC->getTriple().isNVPTX())) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index def0d73e77539..24912c25ef8c6 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -26,18 +26,6 @@ namespace clang { namespace format { static constexpr StringRef Blanks = " \t\v\f\r"; -static bool IsBlank(char C) { - switch (C) { - case ' ': - case '\t': - case '\v': - case '\f': - case '\r': - return true; - default: - return false; - } -} static StringRef getLineCommentIndentPrefix(StringRef Comment, const FormatStyle &Style) { @@ -193,7 +181,7 @@ getStringSplit(StringRef Text, unsigned UsedColumns, unsigned ColumnLimit, if (Chars > MaxSplit || Text.size() <= Advance) break; - if (IsBlank(Text[0])) + if (Blanks.contains(Text[0])) SpaceOffset = SplitPoint; if (Text[0] == '/') SlashOffset = SplitPoint; diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 40b62b2a993d8..d8ee5cb90aaa4 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ 
b/clang/lib/Format/FormatTokenLexer.cpp @@ -1329,6 +1329,8 @@ FormatToken *FormatTokenLexer::getNextToken() { if (FormatTok->is(tok::unknown)) FormatTok->setType(TT_ImplicitStringLiteral); + const bool IsCpp = Style.isCpp(); + // JavaScript and Java do not allow to escape the end of the line with a // backslash. Backslashes are syntax errors in plain source, but can occur in // comments. When a single line comment ends with a \, it'll cause the next @@ -1336,16 +1338,17 @@ FormatToken *FormatTokenLexer::getNextToken() { // finds comments that contain a backslash followed by a line break, truncates // the comment token at the backslash, and resets the lexer to restart behind // the backslash. - if ((Style.isJavaScript() || Style.isJava()) && FormatTok->is(tok::comment) && - FormatTok->TokenText.starts_with("//")) { - size_t BackslashPos = FormatTok->TokenText.find('\\'); - while (BackslashPos != StringRef::npos) { - if (BackslashPos + 1 < FormatTok->TokenText.size() && - FormatTok->TokenText[BackslashPos + 1] == '\n') { - truncateToken(BackslashPos + 1); + if (const auto Text = FormatTok->TokenText; + Text.starts_with("//") && + (IsCpp || Style.isJavaScript() || Style.isJava())) { + assert(FormatTok->is(tok::comment)); + for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos; + Pos = Text.find('\\', Pos)) { + if (Pos < Text.size() && Text[Pos] == '\n' && + (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) { + truncateToken(Pos); break; } - BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1); } } @@ -1450,7 +1453,7 @@ FormatToken *FormatTokenLexer::getNextToken() { Column = FormatTok->LastLineColumnWidth; } - if (Style.isCpp()) { + if (IsCpp) { auto *Identifier = FormatTok->Tok.getIdentifierInfo(); auto it = Macros.find(Identifier); if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() || diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp index cf86c62f3b671..dc5f6faefbab4 100644 --- 
a/clang/lib/Parse/ParseHLSLRootSignature.cpp +++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp @@ -25,44 +25,41 @@ RootSignatureParser::RootSignatureParser( Lexer(Signature->getString()), PP(PP), CurToken(0) {} bool RootSignatureParser::parse() { - // Iterate as many RootElements as possible - do { + // Iterate as many RootSignatureElements as possible, until we hit the + // end of the stream + while (!peekExpectedToken(TokenKind::end_of_stream)) { if (tryConsumeExpectedToken(TokenKind::kw_RootFlags)) { auto Flags = parseRootFlags(); if (!Flags.has_value()) return true; Elements.push_back(*Flags); - } - - if (tryConsumeExpectedToken(TokenKind::kw_RootConstants)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_RootConstants)) { auto Constants = parseRootConstants(); if (!Constants.has_value()) return true; Elements.push_back(*Constants); - } - - if (tryConsumeExpectedToken(TokenKind::kw_DescriptorTable)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_DescriptorTable)) { auto Table = parseDescriptorTable(); if (!Table.has_value()) return true; Elements.push_back(*Table); - } - - if (tryConsumeExpectedToken( - {TokenKind::kw_CBV, TokenKind::kw_SRV, TokenKind::kw_UAV})) { + } else if (tryConsumeExpectedToken( + {TokenKind::kw_CBV, TokenKind::kw_SRV, TokenKind::kw_UAV})) { auto Descriptor = parseRootDescriptor(); if (!Descriptor.has_value()) return true; Elements.push_back(*Descriptor); - } - - if (tryConsumeExpectedToken(TokenKind::kw_StaticSampler)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_StaticSampler)) { auto Sampler = parseStaticSampler(); if (!Sampler.has_value()) return true; Elements.push_back(*Sampler); } - } while (tryConsumeExpectedToken(TokenKind::pu_comma)); + + // ',' denotes another element, otherwise, expected to be at end of stream + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } return consumeExpectedToken(TokenKind::end_of_stream, diag::err_hlsl_unexpected_end_of_params, @@ -139,6 +136,11 @@ std::optional 
RootSignatureParser::parseRootConstants() { if (!Params.has_value()) return std::nullopt; + if (consumeExpectedToken(TokenKind::pu_r_paren, + diag::err_hlsl_unexpected_end_of_params, + /*param of=*/TokenKind::kw_RootConstants)) + return std::nullopt; + // Check mandatory parameters where provided if (!Params->Num32BitConstants.has_value()) { reportDiag(diag::err_hlsl_rootsig_missing_param) @@ -162,11 +164,6 @@ std::optional RootSignatureParser::parseRootConstants() { if (Params->Space.has_value()) Constants.Space = Params->Space.value(); - if (consumeExpectedToken(TokenKind::pu_r_paren, - diag::err_hlsl_unexpected_end_of_params, - /*param of=*/TokenKind::kw_RootConstants)) - return std::nullopt; - return Constants; } @@ -206,6 +203,11 @@ std::optional RootSignatureParser::parseRootDescriptor() { if (!Params.has_value()) return std::nullopt; + if (consumeExpectedToken(TokenKind::pu_r_paren, + diag::err_hlsl_unexpected_end_of_params, + /*param of=*/DescriptorKind)) + return std::nullopt; + // Check mandatory parameters were provided if (!Params->Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_missing_param) << ExpectedReg; @@ -224,11 +226,6 @@ std::optional RootSignatureParser::parseRootDescriptor() { if (Params->Flags.has_value()) Descriptor.Flags = Params->Flags.value(); - if (consumeExpectedToken(TokenKind::pu_r_paren, - diag::err_hlsl_unexpected_end_of_params, - /*param of=*/TokenKind::kw_RootConstants)) - return std::nullopt; - return Descriptor; } @@ -243,18 +240,18 @@ std::optional RootSignatureParser::parseDescriptorTable() { DescriptorTable Table; std::optional Visibility; - // Iterate as many Clauses as possible - do { + // Iterate as many Clauses as possible, until we hit ')' + while (!peekExpectedToken(TokenKind::pu_r_paren)) { if (tryConsumeExpectedToken({TokenKind::kw_CBV, TokenKind::kw_SRV, TokenKind::kw_UAV, TokenKind::kw_Sampler})) { + // DescriptorTableClause - CBV, SRV, UAV, or Sampler auto Clause = parseDescriptorTableClause(); if 
(!Clause.has_value()) return std::nullopt; Elements.push_back(*Clause); Table.NumClauses++; - } - - if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + // visibility = SHADER_VISIBILITY if (Visibility.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -267,17 +264,21 @@ std::optional RootSignatureParser::parseDescriptorTable() { if (!Visibility.has_value()) return std::nullopt; } - } while (tryConsumeExpectedToken(TokenKind::pu_comma)); - // Fill in optional visibility - if (Visibility.has_value()) - Table.Visibility = Visibility.value(); + // ',' denotes another element, otherwise, expected to be at ')' + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } if (consumeExpectedToken(TokenKind::pu_r_paren, diag::err_hlsl_unexpected_end_of_params, /*param of=*/TokenKind::kw_DescriptorTable)) return std::nullopt; + // Fill in optional visibility + if (Visibility.has_value()) + Table.Visibility = Visibility.value(); + return Table; } @@ -323,6 +324,11 @@ RootSignatureParser::parseDescriptorTableClause() { if (!Params.has_value()) return std::nullopt; + if (consumeExpectedToken(TokenKind::pu_r_paren, + diag::err_hlsl_unexpected_end_of_params, + /*param of=*/ParamKind)) + return std::nullopt; + // Check mandatory parameters were provided if (!Params->Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_missing_param) << ExpectedReg; @@ -344,11 +350,6 @@ RootSignatureParser::parseDescriptorTableClause() { if (Params->Flags.has_value()) Clause.Flags = Params->Flags.value(); - if (consumeExpectedToken(TokenKind::pu_r_paren, - diag::err_hlsl_unexpected_end_of_params, - /*param of=*/ParamKind)) - return std::nullopt; - return Clause; } @@ -366,6 +367,11 @@ std::optional RootSignatureParser::parseStaticSampler() { if (!Params.has_value()) return std::nullopt; + if (consumeExpectedToken(TokenKind::pu_r_paren, + 
diag::err_hlsl_unexpected_end_of_params, + /*param of=*/TokenKind::kw_StaticSampler)) + return std::nullopt; + // Check mandatory parameters were provided if (!Params->Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_missing_param) << TokenKind::sReg; @@ -411,11 +417,6 @@ std::optional RootSignatureParser::parseStaticSampler() { if (Params->Visibility.has_value()) Sampler.Visibility = Params->Visibility.value(); - if (consumeExpectedToken(TokenKind::pu_r_paren, - diag::err_hlsl_unexpected_end_of_params, - /*param of=*/TokenKind::kw_StaticSampler)) - return std::nullopt; - return Sampler; } @@ -428,9 +429,9 @@ RootSignatureParser::parseRootConstantParams() { "Expects to only be invoked starting at given token"); ParsedConstantParams Params; - do { - // `num32BitConstants` `=` POS_INT + while (!peekExpectedToken(TokenKind::pu_r_paren)) { if (tryConsumeExpectedToken(TokenKind::kw_num32BitConstants)) { + // `num32BitConstants` `=` POS_INT if (Params.Num32BitConstants.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -443,10 +444,8 @@ RootSignatureParser::parseRootConstantParams() { if (!Num32BitConstants.has_value()) return std::nullopt; Params.Num32BitConstants = Num32BitConstants; - } - - // `b` POS_INT - if (tryConsumeExpectedToken(TokenKind::bReg)) { + } else if (tryConsumeExpectedToken(TokenKind::bReg)) { + // `b` POS_INT if (Params.Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -455,10 +454,8 @@ RootSignatureParser::parseRootConstantParams() { if (!Reg.has_value()) return std::nullopt; Params.Reg = Reg; - } - - // `space` `=` POS_INT - if (tryConsumeExpectedToken(TokenKind::kw_space)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_space)) { + // `space` `=` POS_INT if (Params.Space.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -471,10 +468,8 @@ 
RootSignatureParser::parseRootConstantParams() { if (!Space.has_value()) return std::nullopt; Params.Space = Space; - } - - // `visibility` `=` SHADER_VISIBILITY - if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + // `visibility` `=` SHADER_VISIBILITY if (Params.Visibility.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -488,7 +483,11 @@ RootSignatureParser::parseRootConstantParams() { return std::nullopt; Params.Visibility = Visibility; } - } while (tryConsumeExpectedToken(TokenKind::pu_comma)); + + // ',' denotes another element, otherwise, expected to be at ')' + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } return Params; } @@ -499,9 +498,9 @@ RootSignatureParser::parseRootDescriptorParams(TokenKind RegType) { "Expects to only be invoked starting at given token"); ParsedRootDescriptorParams Params; - do { - // ( `b` | `t` | `u`) POS_INT + while (!peekExpectedToken(TokenKind::pu_r_paren)) { if (tryConsumeExpectedToken(RegType)) { + // ( `b` | `t` | `u`) POS_INT if (Params.Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -510,10 +509,8 @@ RootSignatureParser::parseRootDescriptorParams(TokenKind RegType) { if (!Reg.has_value()) return std::nullopt; Params.Reg = Reg; - } - - // `space` `=` POS_INT - if (tryConsumeExpectedToken(TokenKind::kw_space)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_space)) { + // `space` `=` POS_INT if (Params.Space.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -526,10 +523,8 @@ RootSignatureParser::parseRootDescriptorParams(TokenKind RegType) { if (!Space.has_value()) return std::nullopt; Params.Space = Space; - } - - // `visibility` `=` SHADER_VISIBILITY - if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + } else if 
(tryConsumeExpectedToken(TokenKind::kw_visibility)) { + // `visibility` `=` SHADER_VISIBILITY if (Params.Visibility.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -542,10 +537,8 @@ RootSignatureParser::parseRootDescriptorParams(TokenKind RegType) { if (!Visibility.has_value()) return std::nullopt; Params.Visibility = Visibility; - } - - // `flags` `=` ROOT_DESCRIPTOR_FLAGS - if (tryConsumeExpectedToken(TokenKind::kw_flags)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_flags)) { + // `flags` `=` ROOT_DESCRIPTOR_FLAGS if (Params.Flags.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -559,7 +552,11 @@ RootSignatureParser::parseRootDescriptorParams(TokenKind RegType) { return std::nullopt; Params.Flags = Flags; } - } while (tryConsumeExpectedToken(TokenKind::pu_comma)); + + // ',' denotes another element, otherwise, expected to be at ')' + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } return Params; } @@ -570,9 +567,9 @@ RootSignatureParser::parseDescriptorTableClauseParams(TokenKind RegType) { "Expects to only be invoked starting at given token"); ParsedClauseParams Params; - do { - // ( `b` | `t` | `u` | `s`) POS_INT + while (!peekExpectedToken(TokenKind::pu_r_paren)) { if (tryConsumeExpectedToken(RegType)) { + // ( `b` | `t` | `u` | `s`) POS_INT if (Params.Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -581,10 +578,8 @@ RootSignatureParser::parseDescriptorTableClauseParams(TokenKind RegType) { if (!Reg.has_value()) return std::nullopt; Params.Reg = Reg; - } - - // `numDescriptors` `=` POS_INT | unbounded - if (tryConsumeExpectedToken(TokenKind::kw_numDescriptors)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_numDescriptors)) { + // `numDescriptors` `=` POS_INT | unbounded if (Params.NumDescriptors.has_value()) { 
reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -603,10 +598,8 @@ RootSignatureParser::parseDescriptorTableClauseParams(TokenKind RegType) { } Params.NumDescriptors = NumDescriptors; - } - - // `space` `=` POS_INT - if (tryConsumeExpectedToken(TokenKind::kw_space)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_space)) { + // `space` `=` POS_INT if (Params.Space.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -619,10 +612,8 @@ RootSignatureParser::parseDescriptorTableClauseParams(TokenKind RegType) { if (!Space.has_value()) return std::nullopt; Params.Space = Space; - } - - // `offset` `=` POS_INT | DESCRIPTOR_RANGE_OFFSET_APPEND - if (tryConsumeExpectedToken(TokenKind::kw_offset)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_offset)) { + // `offset` `=` POS_INT | DESCRIPTOR_RANGE_OFFSET_APPEND if (Params.Offset.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -641,10 +632,8 @@ RootSignatureParser::parseDescriptorTableClauseParams(TokenKind RegType) { } Params.Offset = Offset; - } - - // `flags` `=` DESCRIPTOR_RANGE_FLAGS - if (tryConsumeExpectedToken(TokenKind::kw_flags)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_flags)) { + // `flags` `=` DESCRIPTOR_RANGE_FLAGS if (Params.Flags.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -659,7 +648,10 @@ RootSignatureParser::parseDescriptorTableClauseParams(TokenKind RegType) { Params.Flags = Flags; } - } while (tryConsumeExpectedToken(TokenKind::pu_comma)); + // ',' denotes another element, otherwise, expected to be at ')' + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } return Params; } @@ -670,9 +662,9 @@ RootSignatureParser::parseStaticSamplerParams() { "Expects to only be invoked starting at given token"); ParsedStaticSamplerParams Params; - do { - // `s` 
POS_INT + while (!peekExpectedToken(TokenKind::pu_r_paren)) { if (tryConsumeExpectedToken(TokenKind::sReg)) { + // `s` POS_INT if (Params.Reg.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -681,10 +673,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!Reg.has_value()) return std::nullopt; Params.Reg = Reg; - } - - // `filter` `=` FILTER - if (tryConsumeExpectedToken(TokenKind::kw_filter)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_filter)) { + // `filter` `=` FILTER if (Params.Filter.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -697,10 +687,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!Filter.has_value()) return std::nullopt; Params.Filter = Filter; - } - - // `addressU` `=` TEXTURE_ADDRESS - if (tryConsumeExpectedToken(TokenKind::kw_addressU)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_addressU)) { + // `addressU` `=` TEXTURE_ADDRESS if (Params.AddressU.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -713,10 +701,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!AddressU.has_value()) return std::nullopt; Params.AddressU = AddressU; - } - - // `addressV` `=` TEXTURE_ADDRESS - if (tryConsumeExpectedToken(TokenKind::kw_addressV)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_addressV)) { + // `addressV` `=` TEXTURE_ADDRESS if (Params.AddressV.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -729,10 +715,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!AddressV.has_value()) return std::nullopt; Params.AddressV = AddressV; - } - - // `addressW` `=` TEXTURE_ADDRESS - if (tryConsumeExpectedToken(TokenKind::kw_addressW)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_addressW)) { + // `addressW` `=` TEXTURE_ADDRESS if (Params.AddressW.has_value()) { 
reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -745,10 +729,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!AddressW.has_value()) return std::nullopt; Params.AddressW = AddressW; - } - - // `mipLODBias` `=` NUMBER - if (tryConsumeExpectedToken(TokenKind::kw_mipLODBias)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_mipLODBias)) { + // `mipLODBias` `=` NUMBER if (Params.MipLODBias.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -761,10 +743,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!MipLODBias.has_value()) return std::nullopt; Params.MipLODBias = MipLODBias; - } - - // `maxAnisotropy` `=` POS_INT - if (tryConsumeExpectedToken(TokenKind::kw_maxAnisotropy)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_maxAnisotropy)) { + // `maxAnisotropy` `=` POS_INT if (Params.MaxAnisotropy.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -777,10 +757,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!MaxAnisotropy.has_value()) return std::nullopt; Params.MaxAnisotropy = MaxAnisotropy; - } - - // `comparisonFunc` `=` COMPARISON_FUNC - if (tryConsumeExpectedToken(TokenKind::kw_comparisonFunc)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_comparisonFunc)) { + // `comparisonFunc` `=` COMPARISON_FUNC if (Params.CompFunc.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -793,10 +771,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!CompFunc.has_value()) return std::nullopt; Params.CompFunc = CompFunc; - } - - // `borderColor` `=` STATIC_BORDER_COLOR - if (tryConsumeExpectedToken(TokenKind::kw_borderColor)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_borderColor)) { + // `borderColor` `=` STATIC_BORDER_COLOR if (Params.BorderColor.has_value()) { 
reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -809,10 +785,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!BorderColor.has_value()) return std::nullopt; Params.BorderColor = BorderColor; - } - - // `minLOD` `=` NUMBER - if (tryConsumeExpectedToken(TokenKind::kw_minLOD)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_minLOD)) { + // `minLOD` `=` NUMBER if (Params.MinLOD.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -825,10 +799,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!MinLOD.has_value()) return std::nullopt; Params.MinLOD = MinLOD; - } - - // `maxLOD` `=` NUMBER - if (tryConsumeExpectedToken(TokenKind::kw_maxLOD)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_maxLOD)) { + // `maxLOD` `=` NUMBER if (Params.MaxLOD.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -841,10 +813,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!MaxLOD.has_value()) return std::nullopt; Params.MaxLOD = MaxLOD; - } - - // `space` `=` POS_INT - if (tryConsumeExpectedToken(TokenKind::kw_space)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_space)) { + // `space` `=` POS_INT if (Params.Space.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -857,10 +827,8 @@ RootSignatureParser::parseStaticSamplerParams() { if (!Space.has_value()) return std::nullopt; Params.Space = Space; - } - - // `visibility` `=` SHADER_VISIBILITY - if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + } else if (tryConsumeExpectedToken(TokenKind::kw_visibility)) { + // `visibility` `=` SHADER_VISIBILITY if (Params.Visibility.has_value()) { reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; return std::nullopt; @@ -874,7 +842,11 @@ RootSignatureParser::parseStaticSamplerParams() { return std::nullopt; Params.Visibility 
= Visibility; } - } while (tryConsumeExpectedToken(TokenKind::pu_comma)); + + // ',' denotes another element, otherwise, expected to be at ')' + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } return Params; } diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 7420ba2d461c6..ec8acbdff3b49 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -29,6 +29,7 @@ #include "clang/Analysis/Analyses/CFGReachabilityAnalysis.h" #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" +#include "clang/Analysis/Analyses/LifetimeSafety.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -49,6 +50,7 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" #include #include #include @@ -985,11 +987,10 @@ static void DiagUninitUse(Sema &S, const VarDecl *VD, const UninitUse &Use, } /// Diagnose uninitialized const reference usages. -static bool DiagnoseUninitializedConstRefUse(Sema &S, const VarDecl *VD, +static void DiagnoseUninitializedConstRefUse(Sema &S, const VarDecl *VD, const UninitUse &Use) { S.Diag(Use.getUser()->getBeginLoc(), diag::warn_uninit_const_reference) << VD->getDeclName() << Use.getUser()->getSourceRange(); - return true; } /// DiagnoseUninitializedUse -- Helper function for diagnosing uses of an @@ -1531,14 +1532,13 @@ class UninitValsDiagReporter : public UninitVariablesHandler { // order of diagnostics when calling flushDiagnostics(). 
typedef llvm::MapVector UsesMap; UsesMap uses; - UsesMap constRefUses; public: UninitValsDiagReporter(Sema &S) : S(S) {} ~UninitValsDiagReporter() override { flushDiagnostics(); } - MappedType &getUses(UsesMap &um, const VarDecl *vd) { - MappedType &V = um[vd]; + MappedType &getUses(const VarDecl *vd) { + MappedType &V = uses[vd]; if (!V.getPointer()) V.setPointer(new UsesVec()); return V; @@ -1546,18 +1546,10 @@ class UninitValsDiagReporter : public UninitVariablesHandler { void handleUseOfUninitVariable(const VarDecl *vd, const UninitUse &use) override { - getUses(uses, vd).getPointer()->push_back(use); - } - - void handleConstRefUseOfUninitVariable(const VarDecl *vd, - const UninitUse &use) override { - getUses(constRefUses, vd).getPointer()->push_back(use); + getUses(vd).getPointer()->push_back(use); } - void handleSelfInit(const VarDecl *vd) override { - getUses(uses, vd).setInt(true); - getUses(constRefUses, vd).setInt(true); - } + void handleSelfInit(const VarDecl *vd) override { getUses(vd).setInt(true); } void flushDiagnostics() { for (const auto &P : uses) { @@ -1580,6 +1572,9 @@ class UninitValsDiagReporter : public UninitVariablesHandler { // guaranteed to produce them in line/column order, this will provide // a stable ordering. llvm::sort(*vec, [](const UninitUse &a, const UninitUse &b) { + // Move ConstRef uses to the back. + if (a.isConstRefUse() != b.isConstRefUse()) + return b.isConstRefUse(); // Prefer a more confident report over a less confident one. if (a.getKind() != b.getKind()) return a.getKind() > b.getKind(); @@ -1587,6 +1582,11 @@ class UninitValsDiagReporter : public UninitVariablesHandler { }); for (const auto &U : *vec) { + if (U.isConstRefUse()) { + DiagnoseUninitializedConstRefUse(S, vd, U); + break; + } + // If we have self-init, downgrade all uses to 'may be uninitialized'. UninitUse Use = hasSelfInit ? 
UninitUse(U.getUser(), false) : U; @@ -1602,32 +1602,6 @@ class UninitValsDiagReporter : public UninitVariablesHandler { } uses.clear(); - - // Flush all const reference uses diags. - for (const auto &P : constRefUses) { - const VarDecl *vd = P.first; - const MappedType &V = P.second; - - UsesVec *vec = V.getPointer(); - bool hasSelfInit = V.getInt(); - - if (!vec->empty() && hasSelfInit && hasAlwaysUninitializedUse(vec)) - DiagnoseUninitializedUse(S, vd, - UninitUse(vd->getInit()->IgnoreParenCasts(), - /* isAlwaysUninit */ true), - /* alwaysReportSelfInit */ true); - else { - for (const auto &U : *vec) { - if (DiagnoseUninitializedConstRefUse(S, vd, U)) - break; - } - } - - // Release the uses vector. - delete vec; - } - - constRefUses.clear(); } private: @@ -2744,6 +2718,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( .setAlwaysAdd(Stmt::UnaryOperatorClass); } + bool EnableLifetimeSafetyAnalysis = !Diags.isIgnored( + diag::warn_experimental_lifetime_safety_dummy_warning, D->getBeginLoc()); // Install the logical handler. std::optional LEH; if (LogicalErrorHandler::hasActiveDiagnostics(Diags, D->getBeginLoc())) { @@ -2866,6 +2842,12 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } } + // TODO: Enable lifetime safety analysis for other languages once it is + // stable. + if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { + if (CFG *cfg = AC.getCFG()) + runLifetimeSafetyAnalysis(*cast(D), *cfg, AC); + } // Check for violations of "called once" parameter properties. 
if (S.getLangOpts().ObjC && !S.getLangOpts().CPlusPlus && shouldAnalyzeCalledOnceParameters(Diags, D->getBeginLoc())) { diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp index e5c6220bfb47d..87f9ae07550c2 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp @@ -697,7 +697,9 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addArraySubscriptOperators() { AST.DeclarationNames.getCXXOperatorName(OO_Subscript); addHandleAccessFunction(Subscript, /*IsConst=*/true, /*IsRef=*/true); - addHandleAccessFunction(Subscript, /*IsConst=*/false, /*IsRef=*/true); + if (getResourceAttrs().ResourceClass == llvm::dxil::ResourceClass::UAV) + addHandleAccessFunction(Subscript, /*IsConst=*/false, /*IsRef=*/true); + return *this; } @@ -714,7 +716,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addLoadMethods() { return *this; } -FieldDecl *BuiltinTypeDeclBuilder::getResourceHandleField() { +FieldDecl *BuiltinTypeDeclBuilder::getResourceHandleField() const { auto I = Fields.find("__handle"); assert(I != Fields.end() && I->second->getType()->isHLSLAttributedResourceType() && @@ -738,6 +740,12 @@ QualType BuiltinTypeDeclBuilder::getHandleElementType() { return SemaRef.getASTContext().Char8Ty; } +HLSLAttributedResourceType::Attributes +BuiltinTypeDeclBuilder::getResourceAttrs() const { + QualType HandleType = getResourceHandleField()->getType(); + return cast(HandleType)->getAttrs(); +} + // BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::startDefinition() { // assert(!Record->isCompleteDefinition() && "record is already complete"); // Record->startDefinition(); diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h index a52e2938104c7..36c4add20b225 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h @@ -91,10 +91,11 @@ class BuiltinTypeDeclBuilder { 
BuiltinTypeDeclBuilder &addConsumeMethod(); private: - FieldDecl *getResourceHandleField(); + FieldDecl *getResourceHandleField() const; QualType getFirstTemplateTypeParam(); QualType getHandleElementType(); Expr *getConstantIntExpr(int value); + HLSLAttributedResourceType::Attributes getResourceAttrs() const; }; } // namespace hlsl diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp index f21cbbbdb44ee..044abb0ee08a8 100644 --- a/clang/lib/Sema/SemaAPINotes.cpp +++ b/clang/lib/Sema/SemaAPINotes.cpp @@ -52,63 +52,58 @@ static bool isIndirectPointerType(QualType Type) { Pointee->isMemberPointerType(); } -/// Apply nullability to the given declaration. -static void applyNullability(Sema &S, Decl *D, NullabilityKind Nullability, - VersionedInfoMetadata Metadata) { - if (!Metadata.IsActive) - return; +static void applyAPINotesType(Sema &S, Decl *decl, StringRef typeString, + VersionedInfoMetadata metadata) { + if (typeString.empty()) - auto GetModified = - [&](Decl *D, QualType QT, - NullabilityKind Nullability) -> std::optional { - QualType Original = QT; - S.CheckImplicitNullabilityTypeSpecifier(QT, Nullability, D->getLocation(), - isa(D), - /*OverrideExisting=*/true); - return (QT.getTypePtr() != Original.getTypePtr()) ? std::optional(QT) - : std::nullopt; - }; - - if (auto Function = dyn_cast(D)) { - if (auto Modified = - GetModified(D, Function->getReturnType(), Nullability)) { - const FunctionType *FnType = Function->getType()->castAs(); - if (const FunctionProtoType *proto = dyn_cast(FnType)) - Function->setType(S.Context.getFunctionType( - *Modified, proto->getParamTypes(), proto->getExtProtoInfo())); - else - Function->setType( - S.Context.getFunctionNoProtoType(*Modified, FnType->getExtInfo())); - } - } else if (auto Method = dyn_cast(D)) { - if (auto Modified = GetModified(D, Method->getReturnType(), Nullability)) { - Method->setReturnType(*Modified); + return; - // Make it a context-sensitive keyword if we can. 
- if (!isIndirectPointerType(*Modified)) - Method->setObjCDeclQualifier(Decl::ObjCDeclQualifier( - Method->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); - } - } else if (auto Value = dyn_cast(D)) { - if (auto Modified = GetModified(D, Value->getType(), Nullability)) { - Value->setType(*Modified); + // Version-independent APINotes add "type" annotations + // with a versioned attribute for the client to select and apply. + if (S.captureSwiftVersionIndependentAPINotes()) { + auto *typeAttr = SwiftTypeAttr::CreateImplicit(S.Context, typeString); + auto *versioned = SwiftVersionedAdditionAttr::CreateImplicit( + S.Context, metadata.Version, typeAttr, metadata.IsReplacement); + decl->addAttr(versioned); + } else { + if (!metadata.IsActive) + return; + S.ApplyAPINotesType(decl, typeString); + } +} - // Make it a context-sensitive keyword if we can. - if (auto Parm = dyn_cast(D)) { - if (Parm->isObjCMethodParameter() && !isIndirectPointerType(*Modified)) - Parm->setObjCDeclQualifier(Decl::ObjCDeclQualifier( - Parm->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); - } +/// Apply nullability to the given declaration. +static void applyNullability(Sema &S, Decl *decl, NullabilityKind nullability, + VersionedInfoMetadata metadata) { + // Version-independent APINotes add "nullability" annotations + // with a versioned attribute for the client to select and apply. 
+ if (S.captureSwiftVersionIndependentAPINotes()) { + SwiftNullabilityAttr::Kind attrNullabilityKind; + switch (nullability) { + case NullabilityKind::NonNull: + attrNullabilityKind = SwiftNullabilityAttr::Kind::NonNull; + break; + case NullabilityKind::Nullable: + attrNullabilityKind = SwiftNullabilityAttr::Kind::Nullable; + break; + case NullabilityKind::Unspecified: + attrNullabilityKind = SwiftNullabilityAttr::Kind::Unspecified; + break; + case NullabilityKind::NullableResult: + attrNullabilityKind = SwiftNullabilityAttr::Kind::NullableResult; + break; } - } else if (auto Property = dyn_cast(D)) { - if (auto Modified = GetModified(D, Property->getType(), Nullability)) { - Property->setType(*Modified, Property->getTypeSourceInfo()); + auto *nullabilityAttr = + SwiftNullabilityAttr::CreateImplicit(S.Context, attrNullabilityKind); + auto *versioned = SwiftVersionedAdditionAttr::CreateImplicit( + S.Context, metadata.Version, nullabilityAttr, metadata.IsReplacement); + decl->addAttr(versioned); + return; + } else { + if (!metadata.IsActive) + return; - // Make it a property attribute if we can. - if (!isIndirectPointerType(*Modified)) - Property->setPropertyAttributes( - ObjCPropertyAttribute::kind_null_resettable); - } + S.ApplyNullability(decl, nullability); } } @@ -361,42 +356,99 @@ static bool checkAPINotesReplacementType(Sema &S, SourceLocation Loc, return false; } -/// Process API notes for a variable or property. -static void ProcessAPINotes(Sema &S, Decl *D, - const api_notes::VariableInfo &Info, - VersionedInfoMetadata Metadata) { - // Type override. 
- if (Metadata.IsActive && !Info.getType().empty() && - S.ParseTypeFromStringCallback) { - auto ParsedType = S.ParseTypeFromStringCallback( - Info.getType(), "", D->getLocation()); +void Sema::ApplyAPINotesType(Decl *D, StringRef TypeString) { + if (!TypeString.empty() && ParseTypeFromStringCallback) { + auto ParsedType = ParseTypeFromStringCallback(TypeString, "", + D->getLocation()); if (ParsedType.isUsable()) { QualType Type = Sema::GetTypeFromParser(ParsedType.get()); - auto TypeInfo = - S.Context.getTrivialTypeSourceInfo(Type, D->getLocation()); - + auto TypeInfo = Context.getTrivialTypeSourceInfo(Type, D->getLocation()); if (auto Var = dyn_cast(D)) { // Make adjustments to parameter types. if (isa(Var)) { - Type = S.ObjC().AdjustParameterTypeForObjCAutoRefCount( + Type = ObjC().AdjustParameterTypeForObjCAutoRefCount( Type, D->getLocation(), TypeInfo); - Type = S.Context.getAdjustedParameterType(Type); + Type = Context.getAdjustedParameterType(Type); } - if (!checkAPINotesReplacementType(S, Var->getLocation(), Var->getType(), - Type)) { + if (!checkAPINotesReplacementType(*this, Var->getLocation(), + Var->getType(), Type)) { Var->setType(Type); Var->setTypeSourceInfo(TypeInfo); } - } else if (auto Property = dyn_cast(D)) { - if (!checkAPINotesReplacementType(S, Property->getLocation(), - Property->getType(), Type)) - Property->setType(Type, TypeInfo); - - } else + } else if (auto property = dyn_cast(D)) { + if (!checkAPINotesReplacementType(*this, property->getLocation(), + property->getType(), Type)) { + property->setType(Type, TypeInfo); + } + } else { llvm_unreachable("API notes allowed a type on an unknown declaration"); + } + } + } +} + +void Sema::ApplyNullability(Decl *D, NullabilityKind Nullability) { + auto GetModified = + [&](class Decl *D, QualType QT, + NullabilityKind Nullability) -> std::optional { + QualType Original = QT; + CheckImplicitNullabilityTypeSpecifier(QT, Nullability, D->getLocation(), + isa(D), + /*OverrideExisting=*/true); + return 
(QT.getTypePtr() != Original.getTypePtr()) ? std::optional(QT) + : std::nullopt; + }; + + if (auto Function = dyn_cast(D)) { + if (auto Modified = + GetModified(D, Function->getReturnType(), Nullability)) { + const FunctionType *FnType = Function->getType()->castAs(); + if (const FunctionProtoType *proto = dyn_cast(FnType)) + Function->setType(Context.getFunctionType( + *Modified, proto->getParamTypes(), proto->getExtProtoInfo())); + else + Function->setType( + Context.getFunctionNoProtoType(*Modified, FnType->getExtInfo())); + } + } else if (auto Method = dyn_cast(D)) { + if (auto Modified = GetModified(D, Method->getReturnType(), Nullability)) { + Method->setReturnType(*Modified); + + // Make it a context-sensitive keyword if we can. + if (!isIndirectPointerType(*Modified)) + Method->setObjCDeclQualifier(Decl::ObjCDeclQualifier( + Method->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); + } + } else if (auto Value = dyn_cast(D)) { + if (auto Modified = GetModified(D, Value->getType(), Nullability)) { + Value->setType(*Modified); + + // Make it a context-sensitive keyword if we can. + if (auto Parm = dyn_cast(D)) { + if (Parm->isObjCMethodParameter() && !isIndirectPointerType(*Modified)) + Parm->setObjCDeclQualifier(Decl::ObjCDeclQualifier( + Parm->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); + } + } + } else if (auto Property = dyn_cast(D)) { + if (auto Modified = GetModified(D, Property->getType(), Nullability)) { + Property->setType(*Modified, Property->getTypeSourceInfo()); + + // Make it a property attribute if we can. + if (!isIndirectPointerType(*Modified)) + Property->setPropertyAttributes( + ObjCPropertyAttribute::kind_null_resettable); } } +} + +/// Process API notes for a variable or property. +static void ProcessAPINotes(Sema &S, Decl *D, + const api_notes::VariableInfo &Info, + VersionedInfoMetadata Metadata) { + // Type override. + applyAPINotesType(S, D, Info.getType(), Metadata); // Nullability. 
if (auto Nullability = Info.getNullability()) @@ -814,7 +866,8 @@ static void ProcessVersionedAPINotes( Sema &S, SpecificDecl *D, const api_notes::APINotesReader::VersionedInfo Info) { - maybeAttachUnversionedSwiftName(S, D, Info); + if (!S.captureSwiftVersionIndependentAPINotes()) + maybeAttachUnversionedSwiftName(S, D, Info); unsigned Selected = Info.getSelected().value_or(Info.size()); @@ -824,10 +877,18 @@ static void ProcessVersionedAPINotes( std::tie(Version, InfoSlice) = Info[i]; auto Active = (i == Selected) ? IsActive_t::Active : IsActive_t::Inactive; auto Replacement = IsSubstitution_t::Original; - if (Active == IsActive_t::Inactive && Version.empty()) { + + // When collection all APINotes as version-independent, + // capture all as inactive and defer to the client select the + // right one. + if (S.captureSwiftVersionIndependentAPINotes()) { + Active = IsActive_t::Inactive; + Replacement = IsSubstitution_t::Original; + } else if (Active == IsActive_t::Inactive && Version.empty()) { Replacement = IsSubstitution_t::Replacement; Version = Info[Selected].first; } + ProcessAPINotes(S, D, InfoSlice, VersionedInfoMetadata(Version, Active, Replacement)); } diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index e27ed8fd4de14..01252a4bc69c6 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -263,7 +263,7 @@ static void DiagnoseCastQual(Sema &Self, const ExprResult &SrcExpr, // %2: Destination Type static TryCastResult TryLValueToRValueCast(Sema &Self, Expr *SrcExpr, QualType DestType, bool CStyle, - CastKind &Kind, + SourceRange OpRange, CastKind &Kind, CXXCastPath &BasePath, unsigned &msg); static TryCastResult @@ -1425,8 +1425,8 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, // C++11 [expr.static.cast]p3: // A glvalue of type "cv1 T1" can be cast to type "rvalue reference to cv2 // T2" if "cv2 T2" is reference-compatible with "cv1 T1". 
- tcr = TryLValueToRValueCast(Self, SrcExpr.get(), DestType, CStyle, Kind, - BasePath, msg); + tcr = TryLValueToRValueCast(Self, SrcExpr.get(), DestType, CStyle, OpRange, + Kind, BasePath, msg); if (tcr != TC_NotApplicable) return tcr; @@ -1602,8 +1602,8 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, /// Tests whether a conversion according to N2844 is valid. TryCastResult TryLValueToRValueCast(Sema &Self, Expr *SrcExpr, QualType DestType, bool CStyle, - CastKind &Kind, CXXCastPath &BasePath, - unsigned &msg) { + SourceRange OpRange, CastKind &Kind, + CXXCastPath &BasePath, unsigned &msg) { // C++11 [expr.static.cast]p3: // A glvalue of type "cv1 T1" can be cast to type "rvalue reference to // cv2 T2" if "cv2 T2" is reference-compatible with "cv1 T1". @@ -1616,7 +1616,6 @@ TryCastResult TryLValueToRValueCast(Sema &Self, Expr *SrcExpr, // Because we try the reference downcast before this function, from now on // this is the only cast possibility, so we issue an error if we fail now. - // FIXME: Should allow casting away constness if CStyle. 
QualType FromType = SrcExpr->getType(); QualType ToType = R->getPointeeType(); if (CStyle) { @@ -1640,13 +1639,12 @@ TryCastResult TryLValueToRValueCast(Sema &Self, Expr *SrcExpr, if (RefConv & Sema::ReferenceConversions::DerivedToBase) { Kind = CK_DerivedToBase; - CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true, - /*DetectVirtual=*/true); - if (!Self.IsDerivedFrom(SrcExpr->getBeginLoc(), SrcExpr->getType(), - R->getPointeeType(), Paths)) - return TC_NotApplicable; - - Self.BuildBasePathArray(Paths, BasePath); + if (Self.CheckDerivedToBaseConversion(FromType, ToType, + SrcExpr->getBeginLoc(), OpRange, + &BasePath, CStyle)) { + msg = 0; + return TC_Failed; + } } else Kind = CK_NoOp; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 11cbda412667f..d7234e269f645 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -17155,6 +17155,30 @@ bool Sema::CheckEnumUnderlyingType(TypeSourceInfo *TI) { if (T->isDependentType()) return false; + // C++0x 7.2p2: The type-specifier-seq of an enum-base shall name an + // integral type; any cv-qualification is ignored. + // C23 6.7.3.3p5: The underlying type of the enumeration is the unqualified, + // non-atomic version of the type specified by the type specifiers in the + // specifier qualifier list. + // Because of how odd C's rule is, we'll let the user know that operations + // involving the enumeration type will be non-atomic. 
+ if (T->isAtomicType()) + Diag(UnderlyingLoc, diag::warn_atomic_stripped_in_enum); + + Qualifiers Q = T.getQualifiers(); + std::optional QualSelect; + if (Q.hasConst() && Q.hasVolatile()) + QualSelect = diag::CVQualList::Both; + else if (Q.hasConst()) + QualSelect = diag::CVQualList::Const; + else if (Q.hasVolatile()) + QualSelect = diag::CVQualList::Volatile; + + if (QualSelect) + Diag(UnderlyingLoc, diag::warn_cv_stripped_in_enum) << *QualSelect; + + T = T.getAtomicUnqualifiedType(); + // This doesn't use 'isIntegralType' despite the error message mentioning // integral type because isIntegralType would also allow enum types in C. if (const BuiltinType *BT = T->getAs()) @@ -17551,6 +17575,9 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, } else if (UnderlyingType.get()) { // C++0x 7.2p2: The type-specifier-seq of an enum-base shall name an // integral type; any cv-qualification is ignored. + // C23 6.7.3.3p5: The underlying type of the enumeration is the + // unqualified, non-atomic version of the type specified by the type + // specifiers in the specifier qualifier list. TypeSourceInfo *TI = nullptr; GetTypeFromParser(UnderlyingType.get(), &TI); EnumUnderlying = TI; @@ -17563,6 +17590,18 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, UPPC_FixedUnderlyingType)) EnumUnderlying = Context.IntTy.getTypePtr(); + // If the underlying type is atomic, we need to adjust the type before + // continuing. This only happens in the case we stored a TypeSourceInfo + // into EnumUnderlying because the other cases are error recovery up to + // this point. But because it's not possible to gin up a TypeSourceInfo + // for a non-atomic type from an atomic one, we'll store into the Type + // field instead. FIXME: it would be nice to have an easy way to get a + // derived TypeSourceInfo which strips qualifiers including the weird + // ones like _Atomic where it forms a different type. 
+ if (TypeSourceInfo *TI = dyn_cast(EnumUnderlying); + TI && TI->getType()->isAtomicType()) + EnumUnderlying = TI->getType().getAtomicUnqualifiedType().getTypePtr(); + } else if (Context.getTargetInfo().getTriple().isWindowsMSVCEnvironment()) { // For MSVC ABI compatibility, unfixed enums must use an underlying type // of 'int'. However, if this is an unfixed forward declaration, don't set diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 7ebb53318702c..099207727c8c8 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -2912,18 +2912,14 @@ static void handleWarnUnusedResult(Sema &S, Decl *D, const ParsedAttr &AL) { // If this is spelled as the standard C++17 attribute, but not in C++17, // warn about using it as an extension. If there are attribute arguments, - // then claim it's a C++20 extension instead. - // FIXME: If WG14 does not seem likely to adopt the same feature, add an - // extension warning for C23 mode. + // then claim it's a C++20 extension instead. C23 supports this attribute + // with the message; no extension warning is needed there beyond the one + // already issued for accepting attributes in older modes. const LangOptions &LO = S.getLangOpts(); if (AL.getNumArgs() == 1) { if (LO.CPlusPlus && !LO.CPlusPlus20) S.Diag(AL.getLoc(), diag::ext_cxx20_attr) << AL; - // Since this is spelled [[nodiscard]], get the optional string - // literal. If in C++ mode, but not in C++20 mode, diagnose as an - // extension. - // FIXME: C23 should support this feature as well, even as an extension. 
if (!S.checkStringLiteralArgumentAttr(AL, 0, Str, nullptr)) return; } else if (LO.CPlusPlus && !LO.CPlusPlus17) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index a30acbe9a4bca..4ecc9b0d4c5c8 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -6620,6 +6620,8 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPC_affinity: case OMPC_bind: case OMPC_filter: + case OMPC_severity: + case OMPC_message: continue; case OMPC_allocator: case OMPC_flush: @@ -6637,8 +6639,6 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( case OMPC_match: case OMPC_when: case OMPC_at: - case OMPC_severity: - case OMPC_message: default: llvm_unreachable("Unexpected clause"); } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 7af3acacb5ba6..1b54628c5e564 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -13131,7 +13131,8 @@ CompleteNonViableCandidate(Sema &S, OverloadCandidate *Cand, ParamTypes = Cand->Function->getType()->castAs()->getParamTypes(); if (isa(Cand->Function) && - !isa(Cand->Function) && !Reversed) { + !isa(Cand->Function) && !Reversed && + !Cand->Function->hasCXXExplicitFunctionObjectParameter()) { // Conversion 0 is 'this', which doesn't have a corresponding parameter. ConvIdx = 1; if (CSK == OverloadCandidateSet::CSK_Operator && @@ -13149,9 +13150,8 @@ CompleteNonViableCandidate(Sema &S, OverloadCandidate *Cand, // Fill in the rest of the conversions. for (unsigned ParamIdx = Reversed ? ParamTypes.size() - 1 : 0; - ConvIdx != ConvCount; + ConvIdx != ConvCount && ArgIdx < Args.size(); ++ConvIdx, ++ArgIdx, ParamIdx += (Reversed ? -1 : 1)) { - assert(ArgIdx < Args.size() && "no argument for this arg conversion"); if (Cand->Conversions[ConvIdx].isInitialized()) { // We've already checked this conversion. 
} else if (ParamIdx < ParamTypes.size()) { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 1a98b3583185e..b76619fc50268 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -8968,8 +8968,10 @@ Sema::ActOnFinishConceptDefinition(Scope *S, ConceptDecl *C, Expr *ConstraintExpr, const ParsedAttributesView &Attrs) { assert(!C->hasDefinition() && "Concept already defined"); - if (DiagnoseUnexpandedParameterPack(ConstraintExpr)) + if (DiagnoseUnexpandedParameterPack(ConstraintExpr)) { + C->setInvalidDecl(); return nullptr; + } C->setDefinition(ConstraintExpr); ProcessDeclAttributeList(S, C, Attrs); diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 70a4c159f9805..e2c3cdcd536bc 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -2022,8 +2022,17 @@ Decl *TemplateDeclInstantiator::VisitEnumDecl(EnumDecl *D) { DeclarationName()); if (!NewTI || SemaRef.CheckEnumUnderlyingType(NewTI)) Enum->setIntegerType(SemaRef.Context.IntTy); - else - Enum->setIntegerTypeSourceInfo(NewTI); + else { + // If the underlying type is atomic, we need to adjust the type before + // continuing. See C23 6.7.3.3p5 and Sema::ActOnTag(). FIXME: same as + // within ActOnTag(), it would be nice to have an easy way to get a + // derived TypeSourceInfo which strips qualifiers including the weird + // ones like _Atomic where it forms a different type. 
+ if (NewTI->getType()->isAtomicType()) + Enum->setIntegerType(NewTI->getType().getAtomicUnqualifiedType()); + else + Enum->setIntegerTypeSourceInfo(NewTI); + } // C++23 [conv.prom]p4 // if integral promotion can be applied to its underlying type, a prvalue diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp index 2437b2d3595e5..b641e4a0f0abb 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp @@ -117,7 +117,6 @@ DependencyScanningFilesystemSharedCache::getOutOfDateEntries( std::lock_guard LockGuard(Shard.CacheLock); for (const auto &[Path, CachedPair] : Shard.CacheByFilename) { const CachedFileSystemEntry *Entry = CachedPair.first; - llvm::ErrorOr Status = UnderlyingFS.status(Path); if (Status) { if (Entry->getError()) { @@ -128,12 +127,22 @@ DependencyScanningFilesystemSharedCache::getOutOfDateEntries( InvalidDiagInfo.emplace_back(Path.data()); } else { llvm::vfs::Status CachedStatus = Entry->getStatus(); - uint64_t CachedSize = CachedStatus.getSize(); - uint64_t ActualSize = Status->getSize(); - if (CachedSize != ActualSize) { - // This is the case where the cached file has a different size - // from the actual file that comes from the underlying FS. - InvalidDiagInfo.emplace_back(Path.data(), CachedSize, ActualSize); + if (Status->getType() == llvm::sys::fs::file_type::regular_file && + Status->getType() == CachedStatus.getType()) { + // We only check regular files. Directory files sizes could change + // due to content changes, and reporting directory size changes can + // lead to false positives. + // TODO: At the moment, we do not detect symlinks to files whose + // size may change. We need to decide if we want to detect cached + // symlink size changes. We can also expand this to detect file + // type changes. 
+ uint64_t CachedSize = CachedStatus.getSize(); + uint64_t ActualSize = Status->getSize(); + if (CachedSize != ActualSize) { + // This is the case where the cached file has a different size + // from the actual file that comes from the underlying FS. + InvalidDiagInfo.emplace_back(Path.data(), CachedSize, ActualSize); + } } } } diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp index 515211d47b348..27734ffd0e20b 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp @@ -40,6 +40,7 @@ class MakeDependencyPrinterConsumer : public DependencyConsumer { void handlePrebuiltModuleDependency(PrebuiltModuleDep PMD) override {} void handleModuleDependency(ModuleDeps MD) override {} void handleDirectModuleDependency(ModuleID ID) override {} + void handleVisibleModule(std::string ModuleName) override {} void handleContextHash(std::string Hash) override {} void printDependencies(std::string &S) { @@ -154,7 +155,8 @@ DependencyScanningTool::getTranslationUnitDependencies( return Consumer.takeTranslationUnitDeps(); } -llvm::Expected DependencyScanningTool::getModuleDependencies( +llvm::Expected +DependencyScanningTool::getModuleDependencies( StringRef ModuleName, const std::vector &CommandLine, StringRef CWD, const llvm::DenseSet &AlreadySeen, LookupModuleOutputCallback LookupModuleOutput) { @@ -164,7 +166,7 @@ llvm::Expected DependencyScanningTool::getModuleDependencies( Controller, ModuleName); if (Result) return std::move(Result); - return Consumer.takeModuleGraphDeps(); + return Consumer.takeTranslationUnitDeps(); } TranslationUnitDeps FullDependencyConsumer::takeTranslationUnitDeps() { @@ -175,6 +177,7 @@ TranslationUnitDeps FullDependencyConsumer::takeTranslationUnitDeps() { TU.NamedModuleDeps = std::move(NamedModuleDeps); TU.FileDeps = std::move(Dependencies); TU.PrebuiltModuleDeps = 
std::move(PrebuiltModuleDeps); + TU.VisibleModules = std::move(VisibleModules); TU.Commands = std::move(Commands); for (auto &&M : ClangModuleDeps) { @@ -190,19 +193,4 @@ TranslationUnitDeps FullDependencyConsumer::takeTranslationUnitDeps() { return TU; } -ModuleDepsGraph FullDependencyConsumer::takeModuleGraphDeps() { - ModuleDepsGraph ModuleGraph; - - for (auto &&M : ClangModuleDeps) { - auto &MD = M.second; - // TODO: Avoid handleModuleDependency even being called for modules - // we've already seen. - if (AlreadySeen.count(M.first)) - continue; - ModuleGraph.push_back(std::move(MD)); - } - - return ModuleGraph; -} - CallbackActionController::~CallbackActionController() {} diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index fa86d714ff69a..37f8b945d785e 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -673,8 +673,10 @@ void ModuleDepCollectorPP::handleImport(const Module *Imported) { if (MDC.isPrebuiltModule(TopLevelModule)) MDC.DirectPrebuiltModularDeps.insert( {TopLevelModule, PrebuiltModuleDep{TopLevelModule}}); - else + else { MDC.DirectModularDeps.insert(TopLevelModule); + MDC.DirectImports.insert(Imported); + } } void ModuleDepCollectorPP::EndOfMainFile() { @@ -706,6 +708,8 @@ void ModuleDepCollectorPP::EndOfMainFile() { if (!MDC.isPrebuiltModule(M)) MDC.DirectModularDeps.insert(M); + MDC.addVisibleModules(); + for (const Module *M : MDC.DirectModularDeps) handleTopLevelModule(M); @@ -727,6 +731,9 @@ void ModuleDepCollectorPP::EndOfMainFile() { MDC.Consumer.handleDirectModuleDependency(It->second->ID); } + for (auto &&I : MDC.VisibleModules) + MDC.Consumer.handleVisibleModule(std::string(I.getKey())); + for (auto &&I : MDC.FileDeps) MDC.Consumer.handleFileDependency(I); @@ -993,6 +1000,29 @@ bool ModuleDepCollector::isPrebuiltModule(const Module *M) { return true; } +void 
ModuleDepCollector::addVisibleModules() { + llvm::DenseSet ImportedModules; + auto InsertVisibleModules = [&](const Module *M) { + if (ImportedModules.contains(M)) + return; + + VisibleModules.insert(M->getTopLevelModuleName()); + SmallVector Stack; + M->getExportedModules(Stack); + while (!Stack.empty()) { + const Module *CurrModule = Stack.pop_back_val(); + if (ImportedModules.contains(CurrModule)) + continue; + ImportedModules.insert(CurrModule); + VisibleModules.insert(CurrModule->getTopLevelModuleName()); + CurrModule->getExportedModules(Stack); + } + }; + + for (const Module *Import : DirectImports) + InsertVisibleModules(Import); +} + static StringRef makeAbsoluteAndPreferred(CompilerInstance &CI, StringRef Path, SmallVectorImpl &Storage) { if (llvm::sys::path::is_absolute(Path) && diff --git a/clang/test/APINotes/versioned-version-independent.m b/clang/test/APINotes/versioned-version-independent.m new file mode 100644 index 0000000000000..da8b34a1d9ba3 --- /dev/null +++ b/clang/test/APINotes/versioned-version-independent.m @@ -0,0 +1,36 @@ +// RUN: rm -rf %t && mkdir -p %t + +// Build and check the module file in version-independent mode. 
+// RUN: %clang_cc1 -fswift-version-independent-apinotes -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s +// RUN: %clang_cc1 -fswift-version-independent-apinotes -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' &> %t/VersionedKit_AST_Dump.txt +// RUN: cat %t/VersionedKit_AST_Dump.txt | FileCheck -check-prefix=CHECK-VERSIONED-DUMP %s + +#import + +// CHECK-VERSIONED-DUMP-LABEL: Dumping moveToPointDUMP +// CHECK-VERSIONED-DUMP: SwiftNameAttr {{.+}} "moveTo(x:y:)" +// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <> "moveTo(a:b:)" + +// CHECK-VERSIONED-DUMP-LABEL: Dumping unversionedRenameDUMP +// CHECK-VERSIONED-DUMP: SwiftNameAttr {{.+}} "unversionedRename_HEADER()" +// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 0 +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "unversionedRename_NOTES()" + +// CHECK-VERSIONED-DUMP-LABEL: Dumping TestGenericDUMP +// CHECK-VERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 +// CHECK-VERSIONED-DUMP-NEXT: SwiftImportAsNonGenericAttr {{.+}} <> + +// CHECK-VERSIONED-DUMP: Swift3RenamedOnlyDUMP +// CHECK-VERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift3Name" + +// CHECK-VERSIONED-DUMP: Swift3RenamedAlsoDUMP +// CHECK-VERSIONED-DUMP: SwiftNameAttr {{.+}} "Swift4Name" +// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 +// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift3Also" + +// CHECK-VERSIONED-DUMP: Swift4RenamedDUMP +// CHECK-VERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 4 +// 
CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift4Name" + diff --git a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl index b74e183eec9cc..1c8b9c10f5a98 100644 --- a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl @@ -12,7 +12,7 @@ // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RWStructuredBuffer %s | FileCheck -DRESOURCE=RWStructuredBuffer \ -// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-SUBSCRIPT,CHECK-COUNTER,CHECK-LOAD %s +// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-SUBSCRIPT,CHECK-SUBSCRIPT-UAV,CHECK-COUNTER,CHECK-LOAD %s // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \ // RUN: -DRESOURCE=AppendStructuredBuffer %s | FileCheck -DRESOURCE=AppendStructuredBuffer \ @@ -36,7 +36,7 @@ // // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \ // RUN: -DRESOURCE=RasterizerOrderedStructuredBuffer %s | FileCheck -DRESOURCE=RasterizerOrderedStructuredBuffer \ -// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-ROV,CHECK-SUBSCRIPT,CHECK-LOAD %s +// RUN: -check-prefixes=CHECK,CHECK-UAV,CHECK-ROV,CHECK-SUBSCRIPT,CHECK-SUBSCRIPT-UAV,CHECK-LOAD %s // This test tests two different AST generations for each structured buffer. 
// The "EMPTY" test mode verifies the AST generated by forward declaration @@ -170,22 +170,22 @@ RESOURCE Buffer; // CHECK-SUBSCRIPT-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' // CHECK-SUBSCRIPT-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline -// CHECK-SUBSCRIPT-NEXT: CXXMethodDecl {{.*}} operator[] 'hlsl_device element_type &(unsigned int)' -// CHECK-SUBSCRIPT-NEXT: ParmVarDecl {{.*}} Index 'unsigned int' -// CHECK-SUBSCRIPT-NEXT: CompoundStmt -// CHECK-SUBSCRIPT-NEXT: ReturnStmt -// CHECK-SUBSCRIPT-NEXT: UnaryOperator {{.*}} 'hlsl_device element_type' prefix '*' cannot overflow -// CHECK-SUBSCRIPT-NEXT: CallExpr {{.*}} 'hlsl_device element_type *' -// CHECK-SUBSCRIPT-NEXT: ImplicitCastExpr {{.*}} -// CHECK-SUBSCRIPT-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} '__builtin_hlsl_resource_getpointer' 'void (...) noexcept' -// CHECK-SUBSCRIPT-NEXT: MemberExpr {{.*}} '__hlsl_resource_t -// CHECK-SUBSCRIPT-SAME{LITERAL}: [[hlsl::resource_class( -// CHECK-SUBSCRIPT-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SUBSCRIPT-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SUBSCRIPT-SAME: ' lvalue .__handle {{.*}} -// CHECK-SUBSCRIPT-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this -// CHECK-SUBSCRIPT-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' -// CHECK-SUBSCRIPT-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline +// CHECK-SUBSCRIPT-UAV-NEXT: CXXMethodDecl {{.*}} operator[] 'hlsl_device element_type &(unsigned int)' +// CHECK-SUBSCRIPT-UAV-NEXT: ParmVarDecl {{.*}} Index 'unsigned int' +// CHECK-SUBSCRIPT-UAV-NEXT: CompoundStmt +// CHECK-SUBSCRIPT-UAV-NEXT: ReturnStmt +// CHECK-SUBSCRIPT-UAV-NEXT: UnaryOperator {{.*}} 'hlsl_device element_type' prefix '*' cannot overflow +// CHECK-SUBSCRIPT-UAV-NEXT: CallExpr {{.*}} 'hlsl_device element_type *' +// CHECK-SUBSCRIPT-UAV-NEXT: ImplicitCastExpr {{.*}} +// CHECK-SUBSCRIPT-UAV-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} 
'__builtin_hlsl_resource_getpointer' 'void (...) noexcept' +// CHECK-SUBSCRIPT-UAV-NEXT: MemberExpr {{.*}} '__hlsl_resource_t +// CHECK-SUBSCRIPT-UAV-SAME{LITERAL}: [[hlsl::resource_class( +// CHECK-SUBSCRIPT-UAV-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SUBSCRIPT-UAV-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-SUBSCRIPT-UAV-SAME: ' lvalue .__handle {{.*}} +// CHECK-SUBSCRIPT-UAV-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this +// CHECK-SUBSCRIPT-UAV-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' +// CHECK-SUBSCRIPT-UAV-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline // CHECK-NOSUBSCRIPT-NOT: CXXMethodDecl {{.*}} operator[] 'const hlsl_device element_type &(unsigned int) const' // CHECK-NOSUBSCRIPT-NOT: CXXMethodDecl {{.*}} operator[] 'hlsl_device element_type &(unsigned int)' diff --git a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl index d098e5a323ca7..d6b88e276762e 100644 --- a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl @@ -126,7 +126,7 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *' // CHECK-NEXT: AlwaysInlineAttr -// Subsctript operators +// Subscript operators // CHECK: CXXMethodDecl {{.*}} operator[] 'const hlsl_device element_type &(unsigned int) const' // CHECK-NEXT: ParmVarDecl {{.*}} Index 'unsigned int' @@ -145,22 +145,21 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline -// CHECK-NEXT: CXXMethodDecl {{.*}} operator[] 'hlsl_device element_type &(unsigned int)' -// CHECK-NEXT: ParmVarDecl {{.*}} Index 'unsigned int' -// CHECK-NEXT: CompoundStmt -// CHECK-NEXT: ReturnStmt -// CHECK-NEXT: UnaryOperator {{.*}} 'hlsl_device element_type' prefix '*' cannot overflow -// CHECK-NEXT: CallExpr {{.*}} 'hlsl_device element_type *' 
-// CHECK-NEXT: ImplicitCastExpr {{.*}} -// CHECK-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} '__builtin_hlsl_resource_getpointer' 'void (...) noexcept' -// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t +// CHECK-UAV-NEXT: CXXMethodDecl {{.*}} operator[] 'hlsl_device element_type &(unsigned int)' +// CHECK-UAV-NEXT: ParmVarDecl {{.*}} Index 'unsigned int' +// CHECK-UAV-NEXT: CompoundStmt +// CHECK-UAV-NEXT: ReturnStmt +// CHECK-UAV-NEXT: UnaryOperator {{.*}} 'hlsl_device element_type' prefix '*' cannot overflow +// CHECK-UAV-NEXT: CallExpr {{.*}} 'hlsl_device element_type *' +// CHECK-UAV-NEXT: ImplicitCastExpr {{.*}} +// CHECK-UAV-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} '__builtin_hlsl_resource_getpointer' 'void (...) noexcept' +// CHECK-UAV-NEXT: MemberExpr {{.*}} '__hlsl_resource_t // CHECK-UAV-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SRV-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ' lvalue .__handle {{.*}} -// CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this -// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' -// CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline +// CHECK-UAV-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-UAV-SAME: ' lvalue .__handle {{.*}} +// CHECK-UAV-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this +// CHECK-UAV-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' +// CHECK-UAV-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline // Load method diff --git a/clang/test/C/C23/n3030.c b/clang/test/C/C23/n3030.c index 17084bbb55f50..94ea7037edd11 100644 --- a/clang/test/C/C23/n3030.c +++ b/clang/test/C/C23/n3030.c @@ -91,3 +91,19 @@ enum e : short f = 0; // expected-error {{non-defining declaration of enumeratio enum g : short { yyy } h = yyy; enum ee2 : typeof ((enum ee3 : short { A })0, (short)0); + +enum not_actually_atomic : 
_Atomic(short) { // expected-error {{'_Atomic' qualifier ignored; operations involving the enumeration type will be non-atomic}} + Surprise +}; + +enum not_actually_const : const int { // expected-warning {{'const' qualifier in enumeration underlying type ignored}} + SurpriseAgain +}; + +enum not_actually_volatile : volatile int { // expected-warning {{'volatile' qualifier in enumeration underlying type ignored}} + SurpriseOnceMore +}; + +enum not_acually_const_or_volatile : const volatile int { // expected-warning {{'const' and 'volatile' qualifiers in enumeration underlying type ignored}} + WhyTheSurprise +}; diff --git a/clang/test/C/C23/n3030_1.c b/clang/test/C/C23/n3030_1.c new file mode 100644 index 0000000000000..1afc9855767f0 --- /dev/null +++ b/clang/test/C/C23/n3030_1.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -std=c23 -Wno-underlying-atomic-qualifier-ignored -ast-dump %s | FileCheck %s + +// The underlying type is the unqualified, non-atomic version of the type +// specified. +enum const_enum : const short { ConstE }; +// CHECK: EnumDecl {{.*}} const_enum 'short' + +// These were previously being diagnosed as invalid underlying types. They +// are valid; the _Atomic is stripped from the underlying type. 
+enum atomic_enum1 : _Atomic(int) { AtomicE1 }; +// CHECK: EnumDecl {{.*}} atomic_enum1 'int' +enum atomic_enum2 : _Atomic long long { AtomicE2 }; +// CHECK: EnumDecl {{.*}} atomic_enum2 'long long' diff --git a/clang/test/CIR/CodeGen/bitfields.c b/clang/test/CIR/CodeGen/bitfields.c index ee69db22b4a20..fc688fb4cdcaa 100644 --- a/clang/test/CIR/CodeGen/bitfields.c +++ b/clang/test/CIR/CodeGen/bitfields.c @@ -134,3 +134,136 @@ unsigned int load_field_unsigned(A* s) { //OGCG: [[TMP4:%.*]] = lshr i16 [[TMP3]], 3 //OGCG: [[TMP5:%.*]] = and i16 [[TMP4]], 15 //OGCG: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32 + +void store_field() { + S s; + s.e = 3; +} +// CIR: cir.func {{.*@store_field}} +// CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr +// CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[TMP2:%.*]] = cir.get_member [[TMP0]][1] {name = "e"} : !cir.ptr -> !cir.ptr +// CIR: cir.set_bitfield(#bfi_e, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) + +// LLVM: define dso_local void @store_field() +// LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 +// LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 1 +// LLVM: [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 2 +// LLVM: [[TMP3:%.*]] = and i16 [[TMP2]], -32768 +// LLVM: [[TMP4:%.*]] = or i16 [[TMP3]], 3 +// LLVM: store i16 [[TMP4]], ptr [[TMP1]], align 2 + +// OGCG: define dso_local void @store_field() +// OGCG: [[TMP0:%.*]] = alloca %struct.S, align 4 +// OGCG: [[TMP1:%.*]] = getelementptr inbounds nuw %struct.S, ptr [[TMP0]], i32 0, i32 1 +// OGCG: [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 4 +// OGCG: [[TMP3:%.*]] = and i16 [[TMP2]], -32768 +// OGCG: [[TMP4:%.*]] = or i16 [[TMP3]], 3 +// OGCG: store i16 [[TMP4]], ptr [[TMP1]], align 4 + +void store_bitfield_to_bitfield() { + S s; + s.a = s.c; +} + +// CIR: cir.func {{.*@store_bitfield_to_bitfield}} +// CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr, ["s"] {alignment = 4 : i64} +// CIR: [[TMP1:%.*]] = cir.get_member [[TMP0]][0] {name = "c"} : 
!cir.ptr -> !cir.ptr +// CIR: [[TMP2:%.*]] = cir.get_bitfield(#bfi_c, [[TMP1]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_member [[TMP0]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_a, [[TMP3]] : !cir.ptr, [[TMP2]] : !s32i) -> !s32i + +// LLVM: define dso_local void @store_bitfield_to_bitfield() +// LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 +// LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 +// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// LLVM: [[TMP3:%.*]] = shl i64 [[TMP2]], 15 +// LLVM: [[TMP4:%.*]] = ashr i64 [[TMP3]], 47 +// LLVM: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 +// LLVM: [[TMP6:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 +// LLVM: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 +// LLVM: [[TMP9:%.*]] = and i64 [[TMP7]], 15 +// LLVM: [[TMP10:%.*]] = and i64 [[TMP8]], -16 +// LLVM: [[TMP11:%.*]] = or i64 [[TMP10]], [[TMP9]] +// LLVM: store i64 [[TMP11]], ptr [[TMP6]], align 8 +// LLVM: [[TMP12:%.*]] = shl i64 [[TMP9]], 60 +// LLVM: [[TMP13:%.*]] = ashr i64 [[TMP12]], 60 +// LLVM: [[TMP15:%.*]] = trunc i64 [[TMP13]] to i32 + +// OGCG: define dso_local void @store_bitfield_to_bitfield() +// OGCG: [[TMP0:%.*]] = alloca %struct.S, align 4 +// OGCG: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 +// OGCG: [[TMP2:%.*]] = shl i64 [[TMP1]], 15 +// OGCG: [[TMP3:%.*]] = ashr i64 [[TMP2]], 47 +// OGCG: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +// OGCG: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// OGCG: [[TMP6:%.*]] = load i64, ptr [[TMP0]], align 4 +// OGCG: [[TMP7:%.*]] = and i64 [[TMP5]], 15 +// OGCG: [[TMP8:%.*]] = and i64 [[TMP6]], -16 +// OGCG: [[TMP9:%.*]] = or i64 [[TMP8]], [[TMP7]] +// OGCG: store i64 [[TMP9]], ptr [[TMP0]], align 4 +// OGCG: [[TMP10:%.*]] = shl i64 %bf.value, 60 +// OGCG: [[TMP11:%.*]] = ashr i64 [[TMP10]], 60 +// OGCG: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 + +typedef 
struct { + int a : 30; + int volatile b : 8; + int c; +} V; + +void get_volatile(V* v) { + v->b = 3; +} + +// CIR: cir.func dso_local @get_volatile +// CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["v", init] {alignment = 8 : i64} +// CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr +// CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i + +// LLVM: define dso_local void @get_volatile +// LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 +// LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// LLVM: [[TMP2:%.*]] = getelementptr %struct.V, ptr [[TMP1]], i32 0, i32 0 +// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -1095216660481 +// LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 12884901888 +// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 8 + +// OGCG: define dso_local void @get_volatile +// OGCG: [[TMP0:%.*]] = alloca ptr, align 8 +// OGCG: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// OGCG: [[TMP2:%.*]] = load volatile i64, ptr [[TMP1]], align 4 +// OGCG: [[TMP3:%.*]] = and i64 [[TMP2]], -1095216660481 +// OGCG: [[TMP4:%.*]] = or i64 [[TMP3]], 12884901888 +// OGCG: store volatile i64 [[TMP4]], ptr [[TMP1]], align 4 + +void set_volatile(V* v) { + v->b = 3; +} +//CIR: cir.func dso_local @set_volatile +//CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["v", init] {alignment = 8 : i64} +//CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i +//CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr +//CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr +//CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i + +// LLVM: define dso_local void @set_volatile +// LLVM: [[TMP0:%.*]] = alloca
ptr, i64 1, align 8 +// LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// LLVM: [[TMP2:%.*]] = getelementptr %struct.V, ptr [[TMP1]], i32 0, i32 0 +// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -1095216660481 +// LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 12884901888 +// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 8 + +// OGCG: define dso_local void @set_volatile +// OGCG: [[TMP0:%.*]] = alloca ptr, align 8 +// OGCG: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// OGCG: [[TMP2:%.*]] = load volatile i64, ptr [[TMP1]], align 4 +// OGCG: [[TMP3:%.*]] = and i64 [[TMP2]], -1095216660481 +// OGCG: [[TMP4:%.*]] = or i64 [[TMP3]], 12884901888 +// OGCG: store volatile i64 [[TMP4]], ptr [[TMP1]], align 4 diff --git a/clang/test/CIR/CodeGen/bitfields.cpp b/clang/test/CIR/CodeGen/bitfields.cpp index 7372acaeb9e06..6715ebf1f48b6 100644 --- a/clang/test/CIR/CodeGen/bitfields.cpp +++ b/clang/test/CIR/CodeGen/bitfields.cpp @@ -58,3 +58,70 @@ int load_field(S* s) { // OGCG: [[TMP3:%.*]] = shl i64 [[TMP2]], 15 // OGCG: [[TMP4:%.*]] = ashr i64 [[TMP3]], 47 // OGCG: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 + +void store_field() { + S s; + s.a = 3; +} +// CIR: cir.func dso_local @_Z11store_field +// CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr +// CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[TMP2:%.*]] = cir.get_member [[TMP0]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: cir.set_bitfield(#bfi_a, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) + +// LLVM: define dso_local void @_Z11store_fieldv +// LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 +// LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 +// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// LLVM: [[TMP3:%.*]] = and i64 [[TMP2]], -16 +// LLVM: [[TMP4:%.*]] = or i64 [[TMP3]], 3 +// LLVM: store i64 [[TMP4]], ptr [[TMP1]], align 8 + +// OGCG: define dso_local void @_Z11store_fieldv() +// OGCG: 
[[TMP0:%.*]] = alloca %struct.S, align 4 +// OGCG: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 +// OGCG: [[TMP2:%.*]] = and i64 [[TMP1]], -16 +// OGCG: [[TMP3:%.*]] = or i64 [[TMP2]], 3 +// OGCG: store i64 [[TMP3]], ptr [[TMP0]], align 4 + +void store_bitfield_to_bitfield(S* s) { + s->a = s->b = 3; +} + +// CIR: cir.func dso_local @_Z26store_bitfield_to_bitfieldP1S +// CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} +// CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i +// CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr +// CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) -> !s32i +// CIR: [[TMP5:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr +// CIR: [[TMP6:%.*]] = cir.get_member [[TMP5]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: [[TMP7:%.*]] = cir.set_bitfield(#bfi_a, [[TMP6]] : !cir.ptr, [[TMP4]] : !s32i) -> !s32i + +// LLVM: define dso_local void @_Z26store_bitfield_to_bitfieldP1S +// LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 +// LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 +// LLVM: [[TMP2:%.*]] = getelementptr %struct.S, ptr [[TMP1]], i32 0, i32 0 +// LLVM: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483633 +// LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 48 +// LLVM: store i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: [[TMP6:%.*]] = load ptr, ptr [[TMP0]], align 8 +// LLVM: [[TMP7:%.*]] = getelementptr %struct.S, ptr [[TMP6]], i32 0, i32 0 +// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 8 +// LLVM: [[TMP9:%.*]] = and i64 [[TMP8]], -16 +// LLVM: [[TMP10:%.*]] = or i64 [[TMP9]], 3 +// LLVM: store i64 [[TMP10]], ptr [[TMP7]], align 8 + +// OGCG: define dso_local void @_Z26store_bitfield_to_bitfieldP1S +// OGCG: [[TMP0:%.*]] = alloca ptr, align 8 +// OGCG: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 
+// OGCG: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4 +// OGCG: [[TMP3:%.*]] = and i64 [[TMP2]], -2147483633 +// OGCG: [[TMP4:%.*]] = or i64 [[TMP3]], 48 +// OGCG: store i64 [[TMP4]], ptr [[TMP1]], align 4 +// OGCG: [[TMP5:%.*]] = load ptr, ptr [[TMP0]], align 8 +// OGCG: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4 +// OGCG: [[TMP7:%.*]] = and i64 [[TMP6]], -16 +// OGCG: [[TMP8:%.*]] = or i64 [[TMP7]], 3 +// OGCG: store i64 [[TMP8]], ptr [[TMP5]], align 4 diff --git a/clang/test/CIR/CodeGen/bitfields_be.c b/clang/test/CIR/CodeGen/bitfields_be.c index e839bc2b9698d..6133927b67d21 100644 --- a/clang/test/CIR/CodeGen/bitfields_be.c +++ b/clang/test/CIR/CodeGen/bitfields_be.c @@ -42,3 +42,73 @@ int init(S* s) { //OGCG: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 //OGCG: [[TMP3:%.*]] = shl i32 [[TMP2]], 15 //OGCG: [[TMP4:%.*]] = ashr i32 [[TMP3]], 15 + + +void load(S* s) { + s->a = -4; + s->b = 42; + s->c = -12345; +} + +// field 'a' +// CIR: cir.func dso_local @load +// CIR: %[[PTR0:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} loc(#loc35) +// CIR: %[[CONST1:.*]] = cir.const #cir.int<4> : !s32i +// CIR: %[[MIN1:.*]] = cir.unary(minus, %[[CONST1]]) nsw : !s32i, !s32i +// CIR: %[[VAL0:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr +// CIR: %[[GET0:.*]] = cir.get_member %[[VAL0]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[SET0:.*]] = cir.set_bitfield(#bfi_a, %[[GET0]] : !cir.ptr, %[[MIN1]] : !s32i) -> !s32i + +// LLVM: define dso_local void @load +// LLVM: %[[PTR0:.*]] = load ptr +// LLVM: %[[GET0:.*]] = getelementptr %struct.S, ptr %[[PTR0]], i32 0, i32 0 +// LLVM: %[[VAL0:.*]] = load i32, ptr %[[GET0]], align 4 +// LLVM: %[[AND0:.*]] = and i32 %[[VAL0]], 268435455 +// LLVM: %[[OR0:.*]] = or i32 %[[AND0]], -1073741824 +// LLVM: store i32 %[[OR0]], ptr %[[GET0]] + +// OGCG: define dso_local void @load +// OGCG: %[[PTR0:.*]] = load ptr +// OGCG: %[[VAL0:.*]] = load i32, ptr %[[PTR0]] +// OGCG: %[[AND0:.*]] = and i32 
%[[VAL0]], 268435455 +// OGCG: %[[OR0:.*]] = or i32 %[[AND0]], -1073741824 +// OGCG: store i32 %[[OR0]], ptr %[[PTR0]] + +// field 'b' +// CIR: %[[CONST2:.*]] = cir.const #cir.int<42> : !s32i +// CIR: %[[VAL1:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr +// CIR: %[[GET1:.*]] = cir.get_member %[[VAL1]][0] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: %[[SET1:.*]] = cir.set_bitfield(#bfi_b, %[[GET1]] : !cir.ptr, %[[CONST2]] : !s32i) -> !s32i + +// LLVM: %[[PTR1:.*]] = load ptr +// LLVM: %[[GET1:.*]] = getelementptr %struct.S, ptr %[[PTR1]], i32 0, i32 0 +// LLVM: %[[VAL1:.*]] = load i32, ptr %[[GET1]], align 4 +// LLVM: %[[AND1:.*]] = and i32 %[[VAL1]], -268304385 +// LLVM: %[[OR1:.*]] = or i32 %[[AND1]], 5505024 +// LLVM: store i32 %[[OR1]], ptr %[[GET1]] + +// OGCG: %[[PTR1:.*]] = load ptr +// OGCG: %[[VAL1:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[AND1:.*]] = and i32 %[[VAL1]], -268304385 +// OGCG: %[[OR1:.*]] = or i32 %[[AND1]], 5505024 +// OGCG: store i32 %[[OR1]], ptr %[[PTR1]] + +// field 'c' +// CIR: %[[CONST3:.*]] = cir.const #cir.int<12345> : !s32i +// CIR: %[[MIN2:.*]] = cir.unary(minus, %[[CONST3]]) nsw : !s32i, !s32i +// CIR: %[[VAL2:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr +// CIR: %[[GET2:.*]] = cir.get_member %[[VAL2]][0] {name = "c"} : !cir.ptr -> !cir.ptr +// CIR: %[[SET2:.*]] = cir.set_bitfield(#bfi_c, %[[GET2]] : !cir.ptr, %[[MIN2]] : !s32i) -> !s32i + +// LLVM: %[[PTR2:.*]] = load ptr +// LLVM: %[[GET2:.*]] = getelementptr %struct.S, ptr %[[PTR2]], i32 0, i32 0 +// LLVM: %[[VAL2:.*]] = load i32, ptr %[[GET2]], align 4 +// LLVM: %[[AND2:.*]] = and i32 %[[VAL2]], -131072 +// LLVM: %[[OR2:.*]] = or i32 %[[AND2]], 118727 +// LLVM: store i32 %[[OR2]], ptr %[[GET2]] + +// OGCG: %[[PTR2:.*]] = load ptr +// OGCG: %[[VAL2:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[AND2:.*]] = and i32 %[[VAL2]], -131072 +// OGCG: %[[OR2:.*]] = or i32 %[[AND2]], 118727 +// OGCG: store i32 %[[OR2]], ptr %[[PTR2]] diff --git 
a/clang/test/CIR/CodeGen/complex-arithmetic.cpp b/clang/test/CIR/CodeGen/complex-arithmetic.cpp new file mode 100644 index 0000000000000..5131c075744c8 --- /dev/null +++ b/clang/test/CIR/CodeGen/complex-arithmetic.cpp @@ -0,0 +1,160 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void foo() { + int _Complex a; + int _Complex b; + int _Complex c = a + b; +} + +// CIR: %[[COMPLEX_A:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[COMPLEX_B:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[COMPLEX_A]] : !cir.ptr>, !cir.complex +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[COMPLEX_B]] : !cir.ptr>, !cir.complex +// CIR: %[[ADD:.*]] = cir.complex.add %[[TMP_A]], %[[TMP_B]] : !cir.complex + +// LLVM: %[[COMPLEX_A:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[COMPLEX_B:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load { i32, i32 }, ptr %[[COMPLEX_A]], align 4 +// LLVM: %[[TMP_B:.*]] = load { i32, i32 }, ptr %[[COMPLEX_B]], align 4 +// LLVM: %[[A_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { i32, i32 } %[[TMP_A]], 1 +// LLVM: %[[B_REAL:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 0 +// LLVM: %[[B_IMAG:.*]] = extractvalue { i32, i32 } %[[TMP_B]], 1 +// LLVM: %[[ADD_REAL:.*]] = add i32 %[[A_REAL]], %[[B_REAL]] +// LLVM: %[[ADD_IMAG:.*]] = add i32 %[[A_IMAG]], %[[B_IMAG]] +// LLVM: %[[RESULT:.*]] = insertvalue { i32, i32 } poison, i32 %[[ADD_REAL]], 0 +// LLVM: 
%[[RESULT_2:.*]] = insertvalue { i32, i32 } %[[RESULT]], i32 %[[ADD_IMAG]], 1 + +// OGCG: %[[COMPLEX_A:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[COMPLEX_B:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_A]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load i32, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_A]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 0 +// OGCG: %[[B_REAL:.*]] = load i32, ptr %[[B_REAL_PTR]], align 4 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 1 +// OGCG: %[[B_IMAG:.*]] = load i32, ptr %[[B_IMAG_PTR]], align 4 +// OGCG: %[[ADD_REAL:.*]] = add i32 %[[A_REAL]], %[[B_REAL]] +// OGCG: %[[ADD_IMAG:.*]] = add i32 %[[A_IMAG]], %[[B_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store i32 %[[ADD_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store i32 %[[ADD_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo2() { + float _Complex a; + float _Complex b; + float _Complex c = a + b; +} + +// CIR: %[[COMPLEX_A:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[COMPLEX_B:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[COMPLEX_A]] : !cir.ptr>, !cir.complex +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[COMPLEX_B]] : !cir.ptr>, !cir.complex +// CIR: %[[ADD:.*]] = cir.complex.add %[[TMP_A]], %[[TMP_B]] : !cir.complex + +// LLVM: %[[COMPLEX_A:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[COMPLEX_B:.*]] = alloca { 
float, float }, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[COMPLEX_A]], align 4 +// LLVM: %[[TMP_B:.*]] = load { float, float }, ptr %[[COMPLEX_B]], align 4 +// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1 +// LLVM: %[[B_REAL:.*]] = extractvalue { float, float } %[[TMP_B]], 0 +// LLVM: %[[B_IMAG:.*]] = extractvalue { float, float } %[[TMP_B]], 1 +// LLVM: %[[ADD_REAL:.*]] = fadd float %[[A_REAL]], %[[B_REAL]] +// LLVM: %[[ADD_IMAG:.*]] = fadd float %[[A_IMAG]], %[[B_IMAG]] +// LLVM: %[[RESULT:.*]] = insertvalue { float, float } poison, float %[[ADD_REAL]], 0 +// LLVM: %[[RESULT_2:.*]] = insertvalue { float, float } %[[RESULT]], float %[[ADD_IMAG]], 1 + +// OGCG: %[[COMPLEX_A:.*]] = alloca { float, float }, align 4 +// OGCG: %[[COMPLEX_B:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_A]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_A]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_B]], i32 0, i32 0 +// OGCG: %[[B_REAL:.*]] = load float, ptr %[[B_REAL_PTR]], align 4 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_B]], i32 0, i32 1 +// OGCG: %[[B_IMAG:.*]] = load float, ptr %[[B_IMAG_PTR]], align 4 +// OGCG: %[[ADD_REAL:.*]] = fadd float %[[A_REAL]], %[[B_REAL]] +// OGCG: %[[ADD_IMAG:.*]] = fadd float %[[A_IMAG]], %[[B_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, 
ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[ADD_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[ADD_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo3() { + float _Complex a; + float _Complex b; + float _Complex c; + float _Complex d = (a + b) + c; +} + +// CIR: %[[COMPLEX_A:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[COMPLEX_B:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b"] +// CIR: %[[COMPLEX_C:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"] +// CIR: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["d", init] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[COMPLEX_A]] : !cir.ptr>, !cir.complex +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[COMPLEX_B]] : !cir.ptr>, !cir.complex +// CIR: %[[ADD_A_B:.*]] = cir.complex.add %[[TMP_A]], %[[TMP_B]] : !cir.complex +// CIR: %[[TMP_C:.*]] = cir.load{{.*}} %[[COMPLEX_C]] : !cir.ptr>, !cir.complex +// CIR: %[[ADD_A_B_C:.*]] = cir.complex.add %[[ADD_A_B]], %[[TMP_C]] : !cir.complex +// CIR: cir.store{{.*}} %[[ADD_A_B_C]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// LLVM: %[[COMPLEX_A:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[COMPLEX_B:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[COMPLEX_C:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[COMPLEX_A]], align 4 +// LLVM: %[[TMP_B:.*]] = load { float, float }, ptr %[[COMPLEX_B]], align 4 +// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1 +// LLVM: %[[B_REAL:.*]] = extractvalue { float, float } %[[TMP_B]], 0 +// LLVM: %[[B_IMAG:.*]] = extractvalue { float, float } %[[TMP_B]], 1 +// LLVM: %[[ADD_REAL_A_B:.*]] = fadd float %[[A_REAL]], %[[B_REAL]] +// LLVM: %[[ADD_IMAG_A_B:.*]] = fadd float %[[A_IMAG]], %[[B_IMAG]] +// LLVM: %[[A_B:.*]] = insertvalue { float, float } poison, float 
%[[ADD_REAL_A_B]], 0 +// LLVM: %[[TMP_A_B:.*]] = insertvalue { float, float } %[[A_B]], float %[[ADD_IMAG_A_B]], 1 +// LLVM: %[[TMP_C:.*]] = load { float, float }, ptr %[[COMPLEX_C]], align 4 +// LLVM: %[[A_B_REAL:.*]] = extractvalue { float, float } %[[TMP_A_B]], 0 +// LLVM: %[[A_B_IMAG:.*]] = extractvalue { float, float } %[[TMP_A_B]], 1 +// LLVM: %[[C_REAL:.*]] = extractvalue { float, float } %[[TMP_C]], 0 +// LLVM: %[[C_IMAG:.*]] = extractvalue { float, float } %[[TMP_C]], 1 +// LLVM: %[[ADD_REAL_A_B_C:.*]] = fadd float %[[A_B_REAL]], %[[C_REAL]] +// LLVM: %[[ADD_IMAG_A_B_C:.*]] = fadd float %[[A_B_IMAG]], %[[C_IMAG]] +// LLVM: %[[A_B_C:.*]] = insertvalue { float, float } poison, float %[[ADD_REAL_A_B_C]], 0 +// LLVM: %[[TMP_A_B_C:.*]] = insertvalue { float, float } %[[A_B_C]], float %[[ADD_IMAG_A_B_C]], 1 +// LLVM: store { float, float } %[[TMP_A_B_C]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX_A:.*]] = alloca { float, float }, align 4 +// OGCG: %[[COMPLEX_B:.*]] = alloca { float, float }, align 4 +// OGCG: %[[COMPLEX_C:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_A]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_A]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_B]], i32 0, i32 0 +// OGCG: %[[B_REAL:.*]] = load float, ptr %[[B_REAL_PTR]], align 4 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_B]], i32 0, i32 1 +// OGCG: %[[B_IMAG:.*]] = load float, ptr %[[B_IMAG_PTR]], align 4 +// OGCG: %[[ADD_REAL_A_B:.*]] = fadd float %[[A_REAL]], %[[B_REAL]] +// OGCG: %[[ADD_IMAG_A_B:.*]] = fadd float %[[A_IMAG]], %[[B_IMAG]] +// 
OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_C]], i32 0, i32 0 +// OGCG: %[[C_REAL:.*]] = load float, ptr %[[C_REAL_PTR]], align 4 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX_C]], i32 0, i32 1 +// OGCG: %[[C_IMAG:.*]] = load float, ptr %[[C_IMAG_PTR]], align 4 +// OGCG: %[[ADD_REAL_A_B_C:.*]] = fadd float %[[ADD_REAL_A_B]], %[[C_REAL]] +// OGCG: %[[ADD_IMAG_A_B_C:.*]] = fadd float %[[ADD_IMAG_A_B]], %[[C_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[ADD_REAL_A_B_C]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[ADD_IMAG_A_B_C]], ptr %[[RESULT_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 6e7e889df146f..88df771e6f272 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -31,7 +31,7 @@ float _Complex cf2 = { 1.0f, 2.0f }; void foo() { int _Complex c = {}; } // CIR: %[[INIT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init] -// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<0> : !s32i> : !cir.complex +// CIR: %[[COMPLEX:.*]] = cir.const #cir.zero : !cir.complex // CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex, !cir.ptr> // LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4 @@ -216,6 +216,20 @@ void foo9(double a, double b) { // OGCG: store double %[[TMP_A]], ptr %[[C_REAL_PTR]], align 8 // OGCG: store double %[[TMP_B]], ptr %[[C_IMAG_PTR]], align 8 +void foo10() { + double _Complex c; + double *realPtr = &__real__ c; +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c"] +// CIR: %[[REAL_PTR:.*]] = cir.complex.real_ptr %[[COMPLEX]] : !cir.ptr> -> !cir.ptr + +// LLVM: %[[COMPLEX:.*]] = 
alloca { double, double }, i64 1, align 8 +// LLVM: %[[REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 0 + +// OGCG: %[[COMPLEX:.*]] = alloca { double, double }, align 8 +// OGCG: %[[REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 0 + void foo12() { double _Complex c; double imag = __imag__ c; @@ -741,7 +755,7 @@ void foo29() { } // CIR: %[[INIT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] -// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<0> : !s32i> : !cir.complex +// CIR: %[[COMPLEX:.*]] = cir.const #cir.zero : !cir.complex // CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex, !cir.ptr> // LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4 @@ -751,4 +765,4 @@ void foo29() { // OGCG: %[[INIT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[INIT]], i32 0, i32 0 // OGCG: %[[INIT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[INIT]], i32 0, i32 1 // OGCG: store i32 0, ptr %[[INIT_REAL_PTR]], align 4 -// OGCG: store i32 0, ptr %[[INIT_IMAG_PTR]], align 4 \ No newline at end of file +// OGCG: store i32 0, ptr %[[INIT_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp index 4c2877f8460d0..2b06bb0f7cb08 100644 --- a/clang/test/CIR/CodeGen/ctor.cpp +++ b/clang/test/CIR/CodeGen/ctor.cpp @@ -219,3 +219,130 @@ void init_union() { // CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init] // CHECK-NEXT: cir.call @_ZN14UnionInitStrukC1Ev(%[[S_ADDR]]) // CHECK-NEXT: cir.return + +struct Base { + int a; + Base(int val) : a(val) {} +}; + +struct Derived : Base { + Derived(int val) : Base(val) {} +}; + +void test_derived() { + Derived d(1); +} + +// CHECK: cir.func{{.*}} @_ZN4BaseC2Ei(%arg0: !cir.ptr {{.*}}, %arg1: !s32i +// CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CHECK-NEXT: %[[VAL_ADDR:.*]] = cir.alloca {{.*}} ["val", 
init] +// CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] +// CHECK-NEXT: cir.store %arg1, %[[VAL_ADDR]] +// CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] +// CHECK-NEXT: %[[A_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "a"} +// CHECK-NEXT: %[[VAL:.*]] = cir.load{{.*}} %[[VAL_ADDR]] +// CHECK-NEXT: cir.store{{.*}} %[[VAL]], %[[A_ADDR]] +// CHECK-NEXT: cir.return + +// CHECK: cir.func{{.*}} @_ZN7DerivedC2Ei(%arg0: !cir.ptr {{.*}}, %arg1: !s32i +// CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CHECK-NEXT: %[[VAL_ADDR:.*]] = cir.alloca {{.*}} ["val", init] +// CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] +// CHECK-NEXT: cir.store %arg1, %[[VAL_ADDR]] +// CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] +// CHECK-NEXT: %[[BASE:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [0] -> !cir.ptr +// CHECK-NEXT: %[[VAL:.*]] = cir.load{{.*}} %[[VAL_ADDR]] +// CHECK-NEXT: cir.call @_ZN4BaseC2Ei(%[[BASE]], %[[VAL]]) +// CHECK-NEXT: cir.return + +// CHECK: cir.func{{.*}} @_ZN7DerivedC1Ei(%arg0: !cir.ptr {{.*}}, %arg1: !s32i +// CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CHECK-NEXT: %[[VAL_ADDR:.*]] = cir.alloca {{.*}} ["val", init] +// CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] +// CHECK-NEXT: cir.store %arg1, %[[VAL_ADDR]] +// CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] +// CHECK-NEXT: %[[VAL:.*]] = cir.load{{.*}} %[[VAL_ADDR]] +// CHECK-NEXT: cir.call @_ZN7DerivedC2Ei(%[[THIS]], %[[VAL]]) +// CHECK-NEXT: cir.return + +// CHECK: cir.func{{.*}} @_Z12test_derivedv +// CHECK-NEXT: %[[D_ADDR:.*]] = cir.alloca {{.*}} ["d", init] +// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CHECK-NEXT: cir.call @_ZN7DerivedC1Ei(%[[D_ADDR]], %[[ONE]]) +// CHECK-NEXT: cir.return + +struct Base2 { + int b; + Base2(int val) : b(val) {} +}; + +struct Derived2 : Base, Base2 { + int c; + Derived2(int val1, int val2, int val3) : Base(val1), Base2(val2), c(val3) {} +}; + +void test_derived2() { + 
Derived2 d(1, 2, 3); +} + +// CHECK: cir.func{{.*}} @_ZN5Base2C2Ei(%arg0: !cir.ptr {{.*}}, %arg1: !s32i +// CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CHECK-NEXT: %[[VAL_ADDR:.*]] = cir.alloca {{.*}} ["val", init] +// CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] +// CHECK-NEXT: cir.store %arg1, %[[VAL_ADDR]] +// CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] +// CHECK-NEXT: %[[B_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "b"} +// CHECK-NEXT: %[[VAL:.*]] = cir.load{{.*}} %[[VAL_ADDR]] +// CHECK-NEXT: cir.store{{.*}} %[[VAL]], %[[B_ADDR]] +// CHECK-NEXT: cir.return + +// CHECK: cir.func{{.*}} @_ZN8Derived2C2Eiii(%arg0: !cir.ptr +// CHECK-SAME: %arg1: !s32i +// CHECK-SAME: %arg2: !s32i +// CHECK-SAME: %arg3: !s32i +// CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CHECK-NEXT: %[[VAL1_ADDR:.*]] = cir.alloca {{.*}} ["val1", init] +// CHECK-NEXT: %[[VAL2_ADDR:.*]] = cir.alloca {{.*}} ["val2", init] +// CHECK-NEXT: %[[VAL3_ADDR:.*]] = cir.alloca {{.*}} ["val3", init] +// CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] +// CHECK-NEXT: cir.store %arg1, %[[VAL1_ADDR]] +// CHECK-NEXT: cir.store %arg2, %[[VAL2_ADDR]] +// CHECK-NEXT: cir.store %arg3, %[[VAL3_ADDR]] +// CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] +// CHECK-NEXT: %[[BASE:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [0] -> !cir.ptr +// CHECK-NEXT: %[[VAL1:.*]] = cir.load{{.*}} %[[VAL1_ADDR]] +// CHECK-NEXT: cir.call @_ZN4BaseC2Ei(%[[BASE]], %[[VAL1]]) +// CHECK-NEXT: %[[BASE2:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [4] -> !cir.ptr +// CHECK-NEXT: %[[VAL2:.*]] = cir.load{{.*}} %[[VAL2_ADDR]] +// CHECK-NEXT: cir.call @_ZN5Base2C2Ei(%[[BASE2]], %[[VAL2]]) +// CHECK-NEXT: %[[C_ADDR:.*]] = cir.get_member %[[THIS]][2] {name = "c"} +// CHECK-NEXT: %[[VAL3:.*]] = cir.load{{.*}} %[[VAL3_ADDR]] +// CHECK-NEXT: cir.store{{.*}} %[[VAL3]], %[[C_ADDR]] +// CHECK-NEXT: cir.return + +// CHECK: cir.func{{.*}} 
@_ZN8Derived2C1Eiii(%arg0: !cir.ptr +// CHECK-SAME: %arg1: !s32i +// CHECK-SAME: %arg2: !s32i +// CHECK-SAME: %arg3: !s32i +// CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CHECK-NEXT: %[[VAL1_ADDR:.*]] = cir.alloca {{.*}} ["val1", init] +// CHECK-NEXT: %[[VAL2_ADDR:.*]] = cir.alloca {{.*}} ["val2", init] +// CHECK-NEXT: %[[VAL3_ADDR:.*]] = cir.alloca {{.*}} ["val3", init] +// CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] +// CHECK-NEXT: cir.store %arg1, %[[VAL1_ADDR]] +// CHECK-NEXT: cir.store %arg2, %[[VAL2_ADDR]] +// CHECK-NEXT: cir.store %arg3, %[[VAL3_ADDR]] +// CHECK-NEXT: %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]] +// CHECK-NEXT: %[[VAL1:.*]] = cir.load{{.*}} %[[VAL1_ADDR]] +// CHECK-NEXT: %[[VAL2:.*]] = cir.load{{.*}} %[[VAL2_ADDR]] +// CHECK-NEXT: %[[VAL3:.*]] = cir.load{{.*}} %[[VAL3_ADDR]] +// CHECK-NEXT: cir.call @_ZN8Derived2C2Eiii(%[[THIS]], %[[VAL1]], %[[VAL2]], %[[VAL3]]) +// CHECK-NEXT: cir.return + +// CHECK: cir.func{{.*}} @_Z13test_derived2v +// CHECK-NEXT: %[[D_ADDR:.*]] = cir.alloca {{.*}} ["d", init] +// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i +// CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i +// CHECK-NEXT: cir.call @_ZN8Derived2C1Eiii(%[[D_ADDR]], %[[ONE]], %[[TWO]], %[[THREE]]) +// CHECK-NEXT: cir.return diff --git a/clang/test/CIR/CodeGen/enum.cpp b/clang/test/CIR/CodeGen/enum.cpp index 5d9b1057aaa14..247fa0a3bfd43 100644 --- a/clang/test/CIR/CodeGen/enum.cpp +++ b/clang/test/CIR/CodeGen/enum.cpp @@ -14,3 +14,14 @@ int f() { // CHECK: cir.func{{.*}} @_Z1fv // CHECK: cir.const #cir.int<1> : !u32i + +namespace test { + using enum Numbers; +}; + +int f2() { + return test::Two; +} + +// CHECK: cir.func{{.*}} @_Z2f2v +// CHECK: cir.const #cir.int<2> : !u32i diff --git a/clang/test/CIR/CodeGen/namespace.cpp b/clang/test/CIR/CodeGen/namespace.cpp index efae1f2f2f236..4c7812c61bfe4 100644 --- a/clang/test/CIR/CodeGen/namespace.cpp +++ 
b/clang/test/CIR/CodeGen/namespace.cpp @@ -93,3 +93,11 @@ void f7() { } // CHECK: cir.func{{.*}} @_Z2f7v() + +namespace test_alias = test; + +int f8() { + return test_alias::g2; +} + +// CHECK: cir.func{{.*}} @_Z2f8v() diff --git a/clang/test/CIR/IR/invalid-complex.cir b/clang/test/CIR/IR/invalid-complex.cir index 2414809f7dbca..3a11b631a2ac7 100644 --- a/clang/test/CIR/IR/invalid-complex.cir +++ b/clang/test/CIR/IR/invalid-complex.cir @@ -45,3 +45,15 @@ module { cir.return } } + + +// ----- + +module { + cir.func @complex_real_ptr_invalid_result_type() -> !cir.double { + %0 = cir.alloca !cir.complex, !cir.ptr>, ["c"] + // expected-error @below {{result type does not match operand type}} + %1 = cir.complex.real_ptr %0 : !cir.ptr> -> !cir.ptr + cir.return + } +} diff --git a/clang/test/CIR/Transforms/complex-imag-fold.cir b/clang/test/CIR/Transforms/complex-imag-fold.cir index 0d9a4e43142a3..56e062d5285a2 100644 --- a/clang/test/CIR/Transforms/complex-imag-fold.cir +++ b/clang/test/CIR/Transforms/complex-imag-fold.cir @@ -1,4 +1,4 @@ -// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s +// RUN: cir-opt %s -cir-canonicalize -split-input-file -o - | FileCheck %s !s32i = !cir.int @@ -21,3 +21,19 @@ module { // CHECK: } } + +// ----- + +!s32i = !cir.int + +module { + cir.func dso_local @fold_complex_imag_from_create_test(%arg0: !s32i, %arg1: !s32i) -> !s32i { + %0 = cir.complex.create %arg0, %arg1 : !s32i -> !cir.complex + %1 = cir.complex.imag %0 : !cir.complex -> !s32i + cir.return %1 : !s32i + } + + // CHECK: cir.func dso_local @fold_complex_imag_from_create_test(%[[ARG_0:.*]]: !s32i, %[[ARG_1:.*]]: !s32i) -> !s32i { + // CHECK: cir.return %[[ARG_1]] : !s32i + // CHECK: } +} diff --git a/clang/test/CIR/Transforms/complex-real-fold.cir b/clang/test/CIR/Transforms/complex-real-fold.cir index 1cab9be616af0..29b03276f822d 100644 --- a/clang/test/CIR/Transforms/complex-real-fold.cir +++ b/clang/test/CIR/Transforms/complex-real-fold.cir @@ -1,4 +1,4 @@ -// RUN: 
cir-opt %s -cir-canonicalize -o - | FileCheck %s +// RUN: cir-opt %s -cir-canonicalize -split-input-file -o - | FileCheck %s !s32i = !cir.int @@ -21,3 +21,19 @@ module { // CHECK: } } + +// ----- + +!s32i = !cir.int + +module { + cir.func dso_local @fold_complex_real_from_create_test(%arg0: !s32i, %arg1: !s32i) -> !s32i { + %0 = cir.complex.create %arg0, %arg1 : !s32i -> !cir.complex + %1 = cir.complex.real %0 : !cir.complex -> !s32i + cir.return %1 : !s32i + } + + // CHECK: cir.func dso_local @fold_complex_real_from_create_test(%[[ARG_0:.*]]: !s32i, %[[ARG_1:.*]]: !s32i) -> !s32i { + // CHECK: cir.return %[[ARG_0]] : !s32i + // CHECK: } +} diff --git a/clang/test/CXX/basic/basic.link/p3.cpp b/clang/test/CXX/basic/basic.link/p3.cpp index 01202264d2591..e6633a777ddef 100644 --- a/clang/test/CXX/basic/basic.link/p3.cpp +++ b/clang/test/CXX/basic/basic.link/p3.cpp @@ -1,35 +1,18 @@ -// RUN: %clang_cc1 -std=c++2a -verify %s -// RUN: %clang_cc1 -std=c++2a -verify %s -DIMPORT_ERROR=1 -// RUN: %clang_cc1 -std=c++2a -verify %s -DIMPORT_ERROR=2 +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: %clang_cc1 -std=c++20 -verify %t/M.cpp +// RUN: %clang_cc1 -std=c++20 -verify %t/ImportError1.cpp +// RUN: %clang_cc1 -std=c++20 -verify %t/ImportError2.cpp + +//--- M.cpp module; -#if IMPORT_ERROR != 2 struct import { struct inner {}; }; -#endif struct module { struct inner {}; }; - constexpr int n = 123; export module m; // #1 - -// Import errors are fatal, so we test them in isolation. -#if IMPORT_ERROR == 1 -import x = {}; // expected-error {{expected ';' after module name}} - // expected-error@-1 {{module 'x' not found}} - -#elif IMPORT_ERROR == 2 -struct X; -template struct import; -template<> struct import { - static X y; -}; - -// This is not valid because the 'import ' is a pp-import, even though it -// grammatically can't possibly be an import declaration. 
-struct X {} import::y; // expected-error {{'n' file not found}} - -#else module y = {}; // expected-error {{multiple module declarations}} expected-error 2{{}} // expected-note@#1 {{previous module declaration}} @@ -51,4 +34,36 @@ template module module_var_template; // This is a variable named 'import' that shadows the type 'import' above. struct X {} import; -#endif + +//--- ImportError1.cpp +module; + +struct import { struct inner {}; }; +struct module { struct inner {}; }; + +constexpr int n = 123; + +export module m; // #1 + +import x = {}; // expected-error {{expected ';' after module name}} + // expected-error@-1 {{module 'x' not found}} + +//--- ImportError2.cpp +module; + +struct module { struct inner {}; }; + +constexpr int n = 123; + +export module m; // #1 + +struct X; +template struct import; +template<> struct import { + static X y; +}; + +// This is not valid because the 'import ' is a pp-import, even though it +// grammatically can't possibly be an import declaration. +struct X {} import::y; // expected-error {{'n' file not found}} + diff --git a/clang/test/CXX/dcl.dcl/dcl.enum/p2.cpp b/clang/test/CXX/dcl.dcl/dcl.enum/p2.cpp index de826d0570422..7b69358687a2f 100644 --- a/clang/test/CXX/dcl.dcl/dcl.enum/p2.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.enum/p2.cpp @@ -1,6 +1,5 @@ // RUN: %clang_cc1 -std=c++11 -verify %s -// expected-no-diagnostics -enum class E : int const volatile { }; +enum class E : int const volatile { }; // expected-warning {{'const' and 'volatile' qualifiers in enumeration underlying type ignored}} using T = __underlying_type(E); using T = int; diff --git a/clang/test/CXX/expr/expr.post/expr.static.cast/p3-0x.cpp b/clang/test/CXX/expr/expr.post/expr.static.cast/p3-0x.cpp index 830ccda245baa..9e089557d7088 100644 --- a/clang/test/CXX/expr/expr.post/expr.static.cast/p3-0x.cpp +++ b/clang/test/CXX/expr/expr.post/expr.static.cast/p3-0x.cpp @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s -// expected-no-diagnostics 
+// RUN: %clang_cc1 -std=c++14 -Wno-unused-value -verify %s // A glvalue of type "cv1 T1" can be cast to type "rvalue reference to // cv2 T2" if "cv2 T2" is reference-compatible with "cv1 T1" (8.5.3). @@ -23,3 +22,70 @@ void test(A &a, B &b) { const A &&ar10 = static_cast(xvalue()); const A &&ar11 = static_cast(xvalue()); } + +namespace GH121429 { + +struct C : private A { // expected-note 4 {{declared private here}} + C&& that(); + + void f() { + static_cast(*this); + static_cast(*this); + + static_cast(that()); + static_cast(that()); + } +}; +C c; +const C cc; + +void f() { + static_cast(c); // expected-error {{cannot cast 'C' to its private base class 'A'}} + static_cast(c.that()); // expected-error {{cannot cast 'C' to its private base class 'A'}} + + static_cast(c); // expected-error {{cannot cast 'C' to its private base class 'const A'}} + static_cast(c.that()); // expected-error {{cannot cast 'C' to its private base class 'const A'}} +} + +constexpr bool g() { + (A&&)c; + (A&&)(C&&)c; + (A&&)cc; + (A&&)(const C&&)c; + (const A&&)c; + (const A&&)(C&&)c; + (const A&&)cc; + (const A&&)(const C&&)c; + return true; +} +static_assert(g(), ""); + +struct D : A, B { // expected-warning {{direct base 'A' is inaccessible due to ambiguity}} + D&& rv(); +}; +D d; + +void h(const D cd) { + static_cast(d); // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + static_cast(d.rv()); // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + + static_cast(d); // expected-error {{ambiguous conversion from derived class 'D' to base class 'const A'}} + static_cast(d.rv()); // expected-error {{ambiguous conversion from derived class 'D' to base class 'const A'}} + + (A&&)d; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + (A&&)(D&&)d; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + (A&&)cd; // expected-error {{ambiguous conversion from derived 
class 'D' to base class 'A'}} + (A&&)(const D&&)d; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + (const A&&)d; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + (const A&&)(D&&)d; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + (const A&&)cd; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} + (const A&&)(const D&&)d; // expected-error {{ambiguous conversion from derived class 'D' to base class 'A'}} +} + +template +auto s(U u = {}) -> decltype(static_cast(u)); // expected-note 2 {{substitution failure}} + +int i = s(); // expected-error {{no matching function}} +int j = s(); // expected-error {{no matching function}} + +} diff --git a/clang/test/ClangScanDeps/visible-modules.c b/clang/test/ClangScanDeps/visible-modules.c new file mode 100644 index 0000000000000..77716a4956f00 --- /dev/null +++ b/clang/test/ClangScanDeps/visible-modules.c @@ -0,0 +1,116 @@ +// This test verifies that the modules visible to the translation unit are computed in dependency scanning. +// "client" in the first scan represents the translation unit that imports an explicit submodule, +// that only exports one other module. +// In the second scan, the translation unit that imports an explicit submodule, +// that exports an additional module. +// Thus, the dependencies of the top level module for the submodule always differ from what is visible to the TU. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/compile-commands.json.in > %t/compile-commands.json +// RUN: clang-scan-deps -emit-visible-modules -compilation-database %t/compile-commands.json \ +// RUN: -j 1 -format experimental-full 2>&1 > %t/result-first-scan.json +// RUN: cat %t/result-first-scan.json | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t --check-prefix=SINGLE + +/// Re-run scan with different module map for direct dependency. 
+// RUN: mv %t/A_with_visible_export.modulemap %t/Sysroot/usr/include/A/module.modulemap +// RUN: clang-scan-deps -emit-visible-modules -compilation-database %t/compile-commands.json \ +// RUN: -j 1 -format experimental-full 2>&1 > %t/result.json +// RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t --check-prefix=MULTIPLE + +// RUN: %deps-to-rsp %t/result.json --module-name=transitive > %t/transitive.rsp +// RUN: %deps-to-rsp %t/result.json --module-name=visible > %t/visible.rsp +// RUN: %deps-to-rsp %t/result.json --module-name=invisible > %t/invisible.rsp +// RUN: %deps-to-rsp %t/result.json --module-name=A > %t/A.rsp +// RUN: %deps-to-rsp %t/result.json --tu-index=0 > %t/tu.rsp + +// RUN: %clang @%t/transitive.rsp +// RUN: %clang @%t/visible.rsp +// RUN: %clang @%t/invisible.rsp +// RUN: %clang @%t/A.rsp + +/// Verify compilation & scan agree with each other. +// RUN: not %clang @%t/tu.rsp 2>&1 | FileCheck %s --check-prefix=COMPILE + +// SINGLE: "visible-clang-modules": [ +// SINGLE-NEXT: "A" +// SINGLE-NEXT: ] + +// MULTIPLE: "visible-clang-modules": [ +// MULTIPLE-NEXT: "A", +// MULTIPLE-NEXT: "visible" +// MULTIPLE-NEXT: ] + +// COMPILE-NOT: 'visible_t' must be declared before it is used +// COMPILE: 'transitive_t' must be declared before it is used +// COMPILE: 'invisible_t' must be declared before it is used + +//--- compile-commands.json.in +[ +{ + "directory": "DIR", + "command": "clang -c DIR/client.c -isysroot DIR/Sysroot -IDIR/Sysroot/usr/include -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/client.c" +} +] + +//--- Sysroot/usr/include/A/module.modulemap +module A { + explicit module visibleToTU { + header "visibleToTU.h" + } + explicit module invisibleToTU { + header "invisibleToTU.h" + } +} + +//--- A_with_visible_export.modulemap +module A { + explicit module visibleToTU { + header "visibleToTU.h" + export visible + } + explicit module invisibleToTU { + header "invisibleToTU.h" + } +} 
+ +//--- Sysroot/usr/include/A/visibleToTU.h +#include +typedef int A_visibleToTU; + +//--- Sysroot/usr/include/A/invisibleToTU.h +#include +typedef int A_invisibleToTU; + +//--- Sysroot/usr/include/invisible/module.modulemap +module invisible { + umbrella "." +} + +//--- Sysroot/usr/include/invisible/invisible.h +typedef int invisible_t; + +//--- Sysroot/usr/include/visible/module.modulemap +module visible { + umbrella "." +} + +//--- Sysroot/usr/include/visible/visible.h +#include +typedef int visible_t; + +//--- Sysroot/usr/include/transitive/module.modulemap +module transitive { + umbrella "." +} + +//--- Sysroot/usr/include/transitive/transitive.h +typedef int transitive_t; + +//--- client.c +#include +visible_t foo_v(void); +// Both decls are not visible, thus should fail to actually compile. +transitive_t foo_t(void); +invisible_t foo_i(void); diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp index c734c6953e5d1..5b74042329968 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/aarch64-sme-attrs.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -disable-O0-optnone -Werror -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg \ // RUN: | opt -S -passes=inline \ @@ -300,19 +300,19 @@ int test_variadic_template() __arm_inout("za") { preserves_za_decl); } -// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_ENABLED_DECL]] = { 
"aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[ZA_AGNOSTIC]] = { mustprogress noinline nounwind "aarch64_za_state_agnostic" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } -// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress 
noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[ZA_AGNOSTIC]] = { mustprogress noinline nounwind "aarch64_za_state_agnostic" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+sme" } +// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sme" } // CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } // CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } // CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_add-i64.c index c341ff9bb29e6..2889249662ae1 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_add-i64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za32.c index 824c43e6d247c..992c6f0c62ce7 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za32.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s 
-check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za64.c index 61c41450d6457..3e22c77a467f9 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mopa-za64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 
-target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest 
-DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za32.c index 509ad9ec17f73..40d2c6d0cc865 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za32.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: 
%clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za64.c index 9d205beb05f28..3c7b18da14620 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_mops-za64.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -fclang-abi-compat=latest 
-DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/aarch64-sme2-attrs.cpp b/clang/test/CodeGen/AArch64/sme2-intrinsics/aarch64-sme2-attrs.cpp index 15b9ac42cbcf3..b1649efa0adf6 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/aarch64-sme2-attrs.cpp +++ 
b/clang/test/CodeGen/AArch64/sme2-intrinsics/aarch64-sme2-attrs.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 \ // RUN: -disable-O0-optnone -Werror -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg \ // RUN: | opt -S -passes=inline \ diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add.c index 7e7597f82136c..ad70f6de9e66b 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt 
-S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add_sub_za16.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add_sub_za16.c index 6bd9eab5f1846..9cc9963840921 100644 --- 
a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add_sub_za16.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_add_sub_za16.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f8f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f8f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f8f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f8f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ 
-fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -S -Werror -Wall -o /dev/null +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -S -Werror -Wall -o /dev/null // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bmop.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bmop.c index 84521e7e47644..408772d7feebe 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bmop.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_bmop.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 
-triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_clamp.c index 1297185c4b50e..c78a3e4485379 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_clamp.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_clamp.c @@ -1,14 +1,14 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: 
%clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvt.c index 2851ea9ccd22c..d8d07365fa6e8 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvt.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvt.c @@ -2,11 +2,11 @@ // REQUIRES: 
aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvtn.c index b38bf6fd35084..7e40de7d88616 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvtn.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_cvtn.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fmlas16.c index 4338ea030b504..77f9aaaa17767 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fmlas16.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fmlas16.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 
-target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme-f16f16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -S -Werror -Wall %s -o /dev/null +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -S -Werror -Wall %s -o /dev/null // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp_dots.c index 00cbfdbe7ca34..22af781de3d54 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp_dots.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp_dots.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_frint.c index 8ab450587fc70..64bb8ed369cf4 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_frint.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_frint.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_int_dots.c index 961133b89eb29..190438b2c7a02 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_int_dots.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_int_dots.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: 
%clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt.c index 1ab02afbe0904..51737587ff3d7 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt.c 
@@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c index e97075703b185..8a3fcd34291a8 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o 
- -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c index 0730812b1f06f..8b8dc97b1314c 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt.c index b687b580b15a6..0ef2fdcb4486c 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c index 1a9e9d84c6359..16c293d7d4255 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c index c4c89358c16f8..36ae4d24be372 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -2,9 +2,9 @@ // REQUIRES: aarch64-registered-target -// RUN: 
%clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_max.c index 5d57ffb9bdf8c..dd06b232de01d 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_max.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_max.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_maxnm.c index 1d47abe8d487c..42d12b3130c1b 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_maxnm.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_maxnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o 
/dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_min.c index 4e70a39311664..f583f639edbd5 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_min.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_min.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 
-target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ 
%s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_minnm.c index 838cb644e5e39..037484596098e 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_minnm.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_minnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone 
-Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mla.c index 74859c0a23bbb..484a7135e95d0 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mla.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mla.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 
-target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature 
+sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlal.c index 9d8b1351debc2..6f7c31fd8963a 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlal.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlal.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlall.c index 4efc226c10e68..34686fff93792 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlall.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlall.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 
-target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mls.c index 2b1554cd9d8b0..171d6ca75de3d 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mls.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mls.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlsl.c index e56ffaa1db03e..9d771d651002f 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlsl.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mlsl.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop.c index d25e923c24400..198d5e2fcf762 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 
-DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c index 36e0b754b1d35..d6e766f7b182a 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c @@ -1,11 +1,11 @@ 
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 
-target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c index cd12aa523f7cb..03738c083892a 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c @@ -1,9 +1,9 @@ // NOTE: 
Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature 
+sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c index f5b6c566c2602..af81d71105763 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme 
-target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mopa_nonwide.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mopa_nonwide.c index 0eb391c7d79a0..3c7e14c7bddcf 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mopa_nonwide.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mopa_nonwide.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -S -O2 -Werror -o /dev/null %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-b16b16 -target-feature +sme2p1 -target-feature +sme-f16f16 -S -O2 -Werror -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_read.c index c9d532d5fce45..1ae82365c6590 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_read.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_read.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c index 5b61102751494..4c059e350a182 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c @@ -4,10 +4,10 @@ // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: 
%clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sqdmulh.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sqdmulh.c index 5ff801666df88..0df25e76167ef 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sqdmulh.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sqdmulh.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been 
autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | 
opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sub.c index aca66e0d78d66..a44150ec95df1 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sub.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_sub.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx2.c index d3b09f071c58f..a78e68726058d 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx2.c @@ -2,12 +2,12 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 
-target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx4.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx4.c index 45bc83eac7339..323493e9f2299 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx4.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_unpkx4.c @@ -2,12 +2,12 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple 
aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vdot.c index d9445ef03b8c1..03ce662fae0de 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vdot.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vdot.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple 
aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_add.c index de983bcf79309..da85a0dad64af 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_add.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_add.c @@ -2,11 +2,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_qrshr.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_qrshr.c index 3e47a3ecc17c0..28a3d71c66cf6 100644 
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_qrshr.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_qrshr.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_rshl.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_rshl.c index af5a389c7f736..21220db3d2dbd 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_rshl.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_rshl.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 
-target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx2.c index 893cc7519a1d0..f3fbae052ffe0 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx4.c
b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx4.c index d4e77d998e3c2..7d2ef9a694ad4 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx4.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_selx4.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 
-DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx2.c index ec3a2952b2ac6..5f020081de521 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt 
-S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx4.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx4.c index aeac2ae78f6e6..4085e84a460dd 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx4.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_uzpx4.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx2.c 
b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx2.c index 735b3697f150b..5b4b971f6b29e 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx2.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 
-target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx4.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx4.c index 341ae290e9b0e..721465fdaa306 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx4.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_vector_zipx4.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: 
%clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write.c index 7d0fbc9479a87..e05a60f7b23f0 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write_lane_zt.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write_lane_zt.c index f295bfb2d18b4..e82273294f173 100644 --- 
a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write_lane_zt.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_write_lane_zt.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme2 -target-feature +sme -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme2 -target-feature +sme -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -target-feature +sme -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: 
%clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_movaz.c index 98324e78b16bc..bdfc6c15c5a79 100644 --- a/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_movaz.c +++ b/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_movaz.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // REQUIRES: aarch64-registered-target -//RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +//RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s 
| opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_zero.c b/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_zero.c index 7053f056e8eba..3902ca69c6973 100644 --- a/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_zero.c +++ b/clang/test/CodeGen/AArch64/sme2p1-intrinsics/acle_sme2p1_zero.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 
-target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta-bfloat.c deleted file mode 100644 index 51f035906b21e..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta-bfloat.c +++ /dev/null @@ -1,58 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S 
-disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svclasta_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clasta.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z18test_svclasta_bf16u10__SVBool_tu14__SVBfloat16_tS0_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clasta.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svclasta_bf16(svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svclasta_bf16'}} - return SVE_ACLE_FUNC(svclasta, _bf16, , )(pg, fallback, data); -} - -// CHECK-LABEL: @test_svclasta_n_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16( [[TMP0]], bfloat [[FALLBACK:%.*]], [[DATA:%.*]]) -// CHECK-NEXT: ret bfloat [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z20test_svclasta_n_bf16u10__SVBool_tu6__bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16( [[TMP0]], bfloat 
[[FALLBACK:%.*]], [[DATA:%.*]]) -// CPP-CHECK-NEXT: ret bfloat [[TMP1]] -// -bfloat16_t test_svclasta_n_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svclasta_n_bf16'}} - return SVE_ACLE_FUNC(svclasta, _n_bf16, , )(pg, fallback, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta.c index 4712d57be729b..f11decddf05bb 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clasta.c @@ -423,3 +423,37 @@ float64_t test_svclasta_n_f64(svbool_t pg, float64_t fallback, svfloat64_t data) { return SVE_ACLE_FUNC(svclasta,_n_f64,,)(pg, fallback, data); } + +// CHECK-LABEL: @test_svclasta_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clasta.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svclasta_bf16u10__SVBool_tu14__SVBfloat16_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clasta.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svclasta_bf16(svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svclasta_bf16'}} + return SVE_ACLE_FUNC(svclasta, _bf16, , )(pg, fallback, data); +} + +// CHECK-LABEL: @test_svclasta_n_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16( [[TMP0]], bfloat 
[[FALLBACK:%.*]], [[DATA:%.*]]) +// CHECK-NEXT: ret bfloat [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z20test_svclasta_n_bf16u10__SVBool_tu6__bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16( [[TMP0]], bfloat [[FALLBACK:%.*]], [[DATA:%.*]]) +// CPP-CHECK-NEXT: ret bfloat [[TMP1]] +// +bfloat16_t test_svclasta_n_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svclasta_n_bf16'}} + return SVE_ACLE_FUNC(svclasta, _n_bf16, , )(pg, fallback, data); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb-bfloat.c deleted file mode 100644 index 2ee31baf476a0..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb-bfloat.c +++ /dev/null @@ -1,58 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svclastb_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clastb.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z18test_svclastb_bf16u10__SVBool_tu14__SVBfloat16_tS0_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clastb.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svclastb_bf16(svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svclastb_bf16'}} - return SVE_ACLE_FUNC(svclastb, _bf16, , )(pg, fallback, data); -} - -// CHECK-LABEL: @test_svclastb_n_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16( [[TMP0]], bfloat [[FALLBACK:%.*]], [[DATA:%.*]]) 
-// CHECK-NEXT: ret bfloat [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z20test_svclastb_n_bf16u10__SVBool_tu6__bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16( [[TMP0]], bfloat [[FALLBACK:%.*]], [[DATA:%.*]]) -// CPP-CHECK-NEXT: ret bfloat [[TMP1]] -// -bfloat16_t test_svclastb_n_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svclastb_n_bf16'}} - return SVE_ACLE_FUNC(svclastb, _n_bf16, , )(pg, fallback, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb.c index caa5dd9381ab2..394ca9fd0d50d 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_clastb.c @@ -423,3 +423,37 @@ float64_t test_svclastb_n_f64(svbool_t pg, float64_t fallback, svfloat64_t data) { return SVE_ACLE_FUNC(svclastb,_n_f64,,)(pg, fallback, data); } + +// CHECK-LABEL: @test_svclastb_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clastb.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svclastb_bf16u10__SVBool_tu14__SVBfloat16_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.clastb.nxv8bf16( [[TMP0]], [[FALLBACK:%.*]], [[DATA:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svclastb_bf16(svbool_t pg, svbfloat16_t fallback, svbfloat16_t data) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of 
function 'svclastb_bf16'}} + return SVE_ACLE_FUNC(svclastb, _bf16, , )(pg, fallback, data); +} + +// CHECK-LABEL: @test_svclastb_n_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16( [[TMP0]], bfloat [[FALLBACK:%.*]], [[DATA:%.*]]) +// CHECK-NEXT: ret bfloat [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z20test_svclastb_n_bf16u10__SVBool_tu6__bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16( [[TMP0]], bfloat [[FALLBACK:%.*]], [[DATA:%.*]]) +// CPP-CHECK-NEXT: ret bfloat [[TMP1]] +// +bfloat16_t test_svclastb_n_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svclastb_n_bf16'}} + return SVE_ACLE_FUNC(svclastb, _n_bf16, , )(pg, fallback, data); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt-bfloat.c deleted file mode 100644 index 9c28182166702..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt-bfloat.c +++ /dev/null @@ -1,74 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 
-fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svcnt_bf16_z( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( zeroinitializer, [[TMP0]], [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svcnt_bf16_zu10__SVBool_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( zeroinitializer, [[TMP0]], [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svuint16_t test_svcnt_bf16_z(svbool_t pg, svbfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svcnt_bf16_z'}} - return SVE_ACLE_FUNC(svcnt, _bf16, _z, )(pg, op); -} - -// CHECK-LABEL: @test_svcnt_bf16_m( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svcnt_bf16_mu12__SVUint16_tu10__SVBool_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svuint16_t test_svcnt_bf16_m(svuint16_t inactive, svbool_t pg, svbfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svcnt_bf16_m'}} - return SVE_ACLE_FUNC(svcnt, _bf16, _m, )(inactive, pg, op); -} -// CHECK-LABEL: @test_svcnt_bf16_x( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( undef, [[TMP0]], [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svcnt_bf16_xu10__SVBool_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( undef, [[TMP0]], [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svuint16_t test_svcnt_bf16_x(svbool_t pg, svbfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svcnt_bf16_x'}} - return SVE_ACLE_FUNC(svcnt, _bf16, _x, )(pg, op); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt.c index fe545adbd6a10..a45ebdef63d4b 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_cnt.c @@ -570,3 +570,53 @@ svuint64_t test_svcnt_f64_x(svbool_t pg, svfloat64_t op) MODE_ATTR { return SVE_ACLE_FUNC(svcnt,_f64,_x,)(pg, op); } + +// CHECK-LABEL: @test_svcnt_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svcnt_bf16_zu10__SVBool_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( zeroinitializer, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svcnt_bf16_z(svbool_t pg, svbfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of 
function 'svcnt_bf16_z'}} + return SVE_ACLE_FUNC(svcnt, _bf16, _z, )(pg, op); +} + +// CHECK-LABEL: @test_svcnt_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svcnt_bf16_mu12__SVUint16_tu10__SVBool_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svcnt_bf16_m(svuint16_t inactive, svbool_t pg, svbfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svcnt_bf16_m'}} + return SVE_ACLE_FUNC(svcnt, _bf16, _m, )(inactive, pg, op); +} +// CHECK-LABEL: @test_svcnt_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( undef, [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svcnt_bf16_xu10__SVBool_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.cnt.nxv8bf16( undef, [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svuint16_t test_svcnt_bf16_x(svbool_t pg, svbfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svcnt_bf16_x'}} + return SVE_ACLE_FUNC(svcnt, _bf16, _x, )(pg, op); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2-bfloat.c 
deleted file mode 100644 index 4691172b14a69..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2-bfloat.c +++ /dev/null @@ -1,40 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svcreate2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[X0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[X1:%.*]], 1 -// CHECK-NEXT: ret { , } [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z19test_svcreate2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[X0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[X1:%.*]], 1 -// CPP-CHECK-NEXT: ret { , } [[TMP1]] -// -svbfloat16x2_t test_svcreate2_bf16(svbfloat16_t x0, svbfloat16_t x1) ATTR -{ - return SVE_ACLE_FUNC(svcreate2,_bf16,,)(x0, x1); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2.c index 0809250370a68..ca023e6501504 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create2.c @@ -225,3 +225,20 @@ svmfloat8x2_t test_svcreate2_mf8(svmfloat8_t x0, svmfloat8_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_mf8,,)(x0, x1); } + +// CHECK-LABEL: @test_svcreate2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[X0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[X1:%.*]], 1 +// CHECK-NEXT: ret { , } [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svcreate2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[X0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[X1:%.*]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP1]] +// +svbfloat16x2_t test_svcreate2_bf16(svbfloat16_t x0, svbfloat16_t x1) ATTR +{ + return SVE_ACLE_FUNC(svcreate2,_bf16,,)(x0, x1); +} 
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3-bfloat.c deleted file mode 100644 index 3e2bd259e5c7c..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3-bfloat.c +++ /dev/null @@ -1,42 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svcreate3_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[X0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[X1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[X2:%.*]], 2 -// CHECK-NEXT: ret { , , } [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z19test_svcreate3_bf16u14__SVBfloat16_tS_S_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[X0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[X1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[X2:%.*]], 2 -// CPP-CHECK-NEXT: ret { , , } [[TMP2]] -// -svbfloat16x3_t test_svcreate3_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2) ATTR -{ - return SVE_ACLE_FUNC(svcreate3,_bf16,,)(x0, x1, x2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3.c index 3b003bd534b25..c46ee2a057c73 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create3.c @@ -249,3 +249,22 @@ svmfloat8x3_t test_svcreate3_mf8(svmfloat8_t x0, svmfloat8_t x1, svmfloat8_t x2) { return SVE_ACLE_FUNC(svcreate3,_mf8,,)(x0, x1, x2); } + +// CHECK-LABEL: @test_svcreate3_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[X0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[X1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[X2:%.*]], 2 +// CHECK-NEXT: ret { , , } [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svcreate3_bf16u14__SVBfloat16_tS_S_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = 
insertvalue { , , } poison, [[X0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[X1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[X2:%.*]], 2 +// CPP-CHECK-NEXT: ret { , , } [[TMP2]] +// +svbfloat16x3_t test_svcreate3_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2) ATTR +{ + return SVE_ACLE_FUNC(svcreate3,_bf16,,)(x0, x1, x2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4-bfloat.c deleted file mode 100644 index e821c7501d7a9..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4-bfloat.c +++ /dev/null @@ -1,44 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 
-fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svcreate4_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[X0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[X1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[X2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[X4:%.*]], 3 -// CHECK-NEXT: ret { , , , } [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z19test_svcreate4_bf16u14__SVBfloat16_tS_S_S_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[X0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[X1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[X2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[X4:%.*]], 3 -// CPP-CHECK-NEXT: ret { , , , } [[TMP3]] -// -svbfloat16x4_t test_svcreate4_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2, svbfloat16_t x4) ATTR -{ - return SVE_ACLE_FUNC(svcreate4,_bf16,,)(x0, x1, x2, x4); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4.c index 79a88d271f340..a0b6fcb217a52 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_create4.c @@ -273,3 +273,24 @@ svmfloat8x4_t test_svcreate4_mf8(svmfloat8_t x0, svmfloat8_t x1, svmfloat8_t x2, { return SVE_ACLE_FUNC(svcreate4,_mf8,,)(x0, x1, x2, x4); } + 
+// CHECK-LABEL: @test_svcreate4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[X0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[X1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[X2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[X4:%.*]], 3 +// CHECK-NEXT: ret { , , , } [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z19test_svcreate4_bf16u14__SVBfloat16_tS_S_S_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[X0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[X1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[X2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[X4:%.*]], 3 +// CPP-CHECK-NEXT: ret { , , , } [[TMP3]] +// +svbfloat16x4_t test_svcreate4_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2, svbfloat16_t x4) ATTR +{ + return SVE_ACLE_FUNC(svcreate4,_bf16,,)(x0, x1, x2, x4); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup-bfloat.c deleted file mode 100644 index 2e58b503dd81d..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup-bfloat.c +++ /dev/null @@ -1,112 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest 
-DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svdup_n_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP:%.*]], i64 0 -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z17test_svdup_n_bf16u6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP:%.*]], i64 0 -// CPP-CHECK-NEXT: [[TMP0:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svdup_n_bf16(bfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16'}} - return SVE_ACLE_FUNC(svdup, _n, _bf16, )(op); -} - -// CHECK-LABEL: @test_svdup_n_bf16_z( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( zeroinitializer, [[TMP0]], bfloat [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z19test_svdup_n_bf16_zu10__SVBool_tu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( zeroinitializer, [[TMP0]], bfloat [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svdup_n_bf16_z(svbool_t pg, bfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_z'}} - return SVE_ACLE_FUNC(svdup, _n, _bf16_z, )(pg, op); -} - -// CHECK-LABEL: @test_svdup_n_bf16_m( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sve.dup.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], bfloat [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z19test_svdup_n_bf16_mu14__SVBfloat16_tu10__SVBool_tu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], bfloat [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svdup_n_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_m'}} - return SVE_ACLE_FUNC(svdup, _n, _bf16_m, )(inactive, pg, op); -} - -// CHECK-LABEL: @test_svdup_n_bf16_x( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( undef, [[TMP0]], bfloat [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z19test_svdup_n_bf16_xu10__SVBool_tu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( undef, [[TMP0]], bfloat [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svdup_n_bf16_x(svbool_t pg, bfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_x'}} - return SVE_ACLE_FUNC(svdup, _n, _bf16_x, )(pg, op); -} - -// CHECK-LABEL: @test_svdup_lane_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[INDEX:%.*]], i64 0 -// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[DOTSPLAT]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: 
@_Z20test_svdup_lane_bf16u14__SVBfloat16_tt( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[INDEX:%.*]], i64 0 -// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[DOTSPLAT]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svdup_lane_bf16(svbfloat16_t data, uint16_t index) MODE_ATTR -{ - // expected-warning@+1 {{implicit declaration of function 'svdup_lane_bf16'}} - return SVE_ACLE_FUNC(svdup_lane,_bf16,,)(data, index); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup.c index fec77ceb463ff..e76a5df69fc5f 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dup.c @@ -1040,3 +1040,91 @@ svbool_t test_svdup_n_b64(bool op) MODE_ATTR { return SVE_ACLE_FUNC(svdup,_n,_b64,)(op); } + +// CHECK-LABEL: @test_svdup_n_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP:%.*]], i64 0 +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_svdup_n_bf16u6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, bfloat [[OP:%.*]], i64 0 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svdup_n_bf16(bfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16'}} + return SVE_ACLE_FUNC(svdup, _n, _bf16, )(op); +} + +// CHECK-LABEL: @test_svdup_n_bf16_z( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call @llvm.aarch64.sve.dup.nxv8bf16( zeroinitializer, [[TMP0]], bfloat [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svdup_n_bf16_zu10__SVBool_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( zeroinitializer, [[TMP0]], bfloat [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svdup_n_bf16_z(svbool_t pg, bfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_z'}} + return SVE_ACLE_FUNC(svdup, _n, _bf16_z, )(pg, op); +} + +// CHECK-LABEL: @test_svdup_n_bf16_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], bfloat [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svdup_n_bf16_mu14__SVBfloat16_tu10__SVBool_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( [[INACTIVE:%.*]], [[TMP0]], bfloat [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svdup_n_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_m'}} + return SVE_ACLE_FUNC(svdup, _n, _bf16_m, )(inactive, pg, op); +} + +// CHECK-LABEL: @test_svdup_n_bf16_x( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( undef, [[TMP0]], bfloat [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svdup_n_bf16_xu10__SVBool_tu6__bf16( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.dup.nxv8bf16( undef, [[TMP0]], bfloat [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svdup_n_bf16_x(svbool_t pg, bfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svdup_n_bf16_x'}} + return SVE_ACLE_FUNC(svdup, _n, _bf16_x, )(pg, op); +} + +// CHECK-LABEL: @test_svdup_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[INDEX:%.*]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[DOTSPLAT]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svdup_lane_bf16u14__SVBfloat16_tt( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[INDEX:%.*]], i64 0 +// CPP-CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[DOTSPLAT]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svdup_lane_bf16(svbfloat16_t data, uint16_t index) MODE_ATTR +{ + // expected-warning@+1 {{implicit declaration of function 'svdup_lane_bf16'}} + return SVE_ACLE_FUNC(svdup_lane,_bf16,,)(data, index); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq-bfloat.c deleted file mode 100644 index 45e30aa20f29a..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq-bfloat.c +++ /dev/null @@ -1,73 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svdupq_lane_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[DATA:%.*]], i64 [[INDEX:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z21test_svdupq_lane_bf16u14__SVBfloat16_tm( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[DATA:%.*]], i64 [[INDEX:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svdupq_lane_bf16(svbfloat16_t data, uint64_t index) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svdupq_lane_bf16'}} - return SVE_ACLE_FUNC(svdupq_lane, _bf16, , )(data, index); -} -// CHECK-LABEL: @test_svdupq_n_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x bfloat> poison, bfloat [[X0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> [[TMP0]], bfloat [[X1:%.*]], i64 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x bfloat> [[TMP1]], bfloat [[X2:%.*]], i64 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x bfloat> [[TMP2]], bfloat [[X3:%.*]], i64 3 -// CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x bfloat> [[TMP3]], bfloat [[X4:%.*]], i64 4 -// CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x bfloat> [[TMP4]], bfloat [[X5:%.*]], i64 5 -// CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x bfloat> [[TMP5]], bfloat [[X6:%.*]], i64 6 -// CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x bfloat> [[TMP6]], bfloat [[X7:%.*]], i64 7 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP7]], i64 0) -// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[TMP8]], i64 0) -// CHECK-NEXT: ret [[TMP9]] -// -// CPP-CHECK-LABEL: @_Z18test_svdupq_n_bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16( -// CPP-CHECK-NEXT: entry: 
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x bfloat> poison, bfloat [[X0:%.*]], i64 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> [[TMP0]], bfloat [[X1:%.*]], i64 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x bfloat> [[TMP1]], bfloat [[X2:%.*]], i64 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x bfloat> [[TMP2]], bfloat [[X3:%.*]], i64 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x bfloat> [[TMP3]], bfloat [[X4:%.*]], i64 4 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x bfloat> [[TMP4]], bfloat [[X5:%.*]], i64 5 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x bfloat> [[TMP5]], bfloat [[X6:%.*]], i64 6 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x bfloat> [[TMP6]], bfloat [[X7:%.*]], i64 7 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP7]], i64 0) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[TMP8]], i64 0) -// CPP-CHECK-NEXT: ret [[TMP9]] -// -svbfloat16_t test_svdupq_n_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3, - bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7) MODE_ATTR { - // - // expected-warning@+1 {{implicit declaration of function 'svdupq_n_bf16'}} - return SVE_ACLE_FUNC(svdupq, _n, _bf16, )(x0, x1, x2, x3, x4, x5, x6, x7); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq.c index 5671383dc7339..5928de5fea1ff 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_dupq.c @@ -875,3 +875,52 @@ svbool_t test_svdupq_n_b64(bool x0, bool x1) MODE_ATTR { return SVE_ACLE_FUNC(svdupq,_n,_b64,)(x0, x1); } + +// CHECK-LABEL: @test_svdupq_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[DATA:%.*]], i64 [[INDEX:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// 
+// CPP-CHECK-LABEL: @_Z21test_svdupq_lane_bf16u14__SVBfloat16_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[DATA:%.*]], i64 [[INDEX:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svdupq_lane_bf16(svbfloat16_t data, uint64_t index) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svdupq_lane_bf16'}} + return SVE_ACLE_FUNC(svdupq_lane, _bf16, , )(data, index); +} +// CHECK-LABEL: @test_svdupq_n_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x bfloat> poison, bfloat [[X0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> [[TMP0]], bfloat [[X1:%.*]], i64 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x bfloat> [[TMP1]], bfloat [[X2:%.*]], i64 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x bfloat> [[TMP2]], bfloat [[X3:%.*]], i64 3 +// CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x bfloat> [[TMP3]], bfloat [[X4:%.*]], i64 4 +// CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x bfloat> [[TMP4]], bfloat [[X5:%.*]], i64 5 +// CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x bfloat> [[TMP5]], bfloat [[X6:%.*]], i64 6 +// CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x bfloat> [[TMP6]], bfloat [[X7:%.*]], i64 7 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP7]], i64 0) +// CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[TMP8]], i64 0) +// CHECK-NEXT: ret [[TMP9]] +// +// CPP-CHECK-LABEL: @_Z18test_svdupq_n_bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16u6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x bfloat> poison, bfloat [[X0:%.*]], i64 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> [[TMP0]], bfloat [[X1:%.*]], i64 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x bfloat> [[TMP1]], bfloat [[X2:%.*]], i64 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertelement 
<8 x bfloat> [[TMP2]], bfloat [[X3:%.*]], i64 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x bfloat> [[TMP3]], bfloat [[X4:%.*]], i64 4 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x bfloat> [[TMP4]], bfloat [[X5:%.*]], i64 5 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x bfloat> [[TMP5]], bfloat [[X6:%.*]], i64 6 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x bfloat> [[TMP6]], bfloat [[X7:%.*]], i64 7 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( poison, <8 x bfloat> [[TMP7]], i64 0) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call @llvm.aarch64.sve.dupq.lane.nxv8bf16( [[TMP8]], i64 0) +// CPP-CHECK-NEXT: ret [[TMP9]] +// +svbfloat16_t test_svdupq_n_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3, + bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7) MODE_ATTR { + // + // expected-warning@+1 {{implicit declaration of function 'svdupq_n_bf16'}} + return SVE_ACLE_FUNC(svdupq, _n, _bf16, )(x0, x1, x2, x3, x4, x5, x6, x7); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext-bfloat.c deleted file mode 100644 index 69f7c059527d4..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature 
+sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svext_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ext.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 127) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z15test_svext_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ext.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 127) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svext_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - // expected-warning@+1 {{implicit declaration of function 'svext_bf16'}} - return SVE_ACLE_FUNC(svext,_bf16,,)(op1, op2, 127); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext.c index e5b8e7e8a270c..9cdc95788bb7c 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext.c +++ 
b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ext.c @@ -246,3 +246,19 @@ svfloat64_t test_svext_f64(svfloat64_t op1, svfloat64_t op2) MODE_ATTR { return SVE_ACLE_FUNC(svext,_f64,,)(op1, op2, 31); } + +// CHECK-LABEL: @test_svext_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ext.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 127) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svext_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.ext.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]], i32 127) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svext_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + // expected-warning@+1 {{implicit declaration of function 'svext_bf16'}} + return SVE_ACLE_FUNC(svext,_bf16,,)(op1, op2, 127); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2-bfloat.c deleted file mode 100644 index 05d9ec6684794..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2-bfloat.c +++ /dev/null @@ -1,61 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svget2_bf16_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CHECK-NEXT: ret [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z18test_svget2_bf16_014svbfloat16x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-CHECK-NEXT: ret [[TMP2]] -// -svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 0); -} - -// CHECK-LABEL: @test_svget2_bf16_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 1 -// CHECK-NEXT: ret [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z18test_svget2_bf16_114svbfloat16x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-CHECK-NEXT: ret [[TMP2]] -// -svbfloat16_t test_svget2_bf16_1(svbfloat16x2_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 1); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2.c index 66fd925db9359..ac93ac0073404 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get2.c @@ -250,3 +250,41 @@ svmfloat8_t test_svget2_mf8(svmfloat8x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_mf8,,)(tuple, 0); } + +// CHECK-LABEL: @test_svget2_bf16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z18test_svget2_bf16_014svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 0); +} + +// CHECK-LABEL: @test_svget2_bf16_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z18test_svget2_bf16_114svbfloat16x2_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svget2_bf16_1(svbfloat16x2_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 1); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3-bfloat.c deleted file mode 100644 index 950c4dad9749a..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3-bfloat.c +++ /dev/null @@ -1,86 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef 
SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svget3_bf16_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CHECK-NEXT: ret [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z18test_svget3_bf16_014svbfloat16x3_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-CHECK-NEXT: ret [[TMP3]] -// -svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 0); -} - -// CHECK-LABEL: @test_svget3_bf16_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CHECK-NEXT: ret [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z18test_svget3_bf16_114svbfloat16x3_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-CHECK-NEXT: ret [[TMP3]] -// -svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 1); -} - -// CHECK-LABEL: @test_svget3_bf16_2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CHECK-NEXT: ret [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z18test_svget3_bf16_214svbfloat16x3_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-CHECK-NEXT: ret [[TMP3]] -// -svbfloat16_t test_svget3_bf16_2(svbfloat16x3_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3.c index db11a42eded6e..c81d0866adb5e 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get3.c @@ -275,3 +275,66 @@ svmfloat8_t test_svget3_mf8(svmfloat8x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_mf8,,)(tuple, 0); } + +// CHECK-LABEL: @test_svget3_bf16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = 
insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z18test_svget3_bf16_014svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 0); +} + +// CHECK-LABEL: @test_svget3_bf16_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z18test_svget3_bf16_114svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 1); +} + +// CHECK-LABEL: @test_svget3_bf16_2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue 
{ , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: ret [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z18test_svget3_bf16_214svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: ret [[TMP3]] +// +svbfloat16_t test_svget3_bf16_2(svbfloat16x3_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4-bfloat.c deleted file mode 100644 index 55b379140ca3d..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4-bfloat.c +++ /dev/null @@ -1,115 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// 
RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svget4_bf16_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CHECK-NEXT: ret [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_014svbfloat16x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-CHECK-NEXT: ret [[TMP4]] -// -svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 0); -} - -// CHECK-LABEL: @test_svget4_bf16_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , 
, } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CHECK-NEXT: ret [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_114svbfloat16x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-CHECK-NEXT: ret [[TMP4]] -// -svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 1); -} - -// CHECK-LABEL: @test_svget4_bf16_2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CHECK-NEXT: ret [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_214svbfloat16x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = 
extractvalue { , , , } [[TMP3]], 2 -// CPP-CHECK-NEXT: ret [[TMP4]] -// -svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 2); -} - -// CHECK-LABEL: @test_svget4_bf16_3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CHECK-NEXT: ret [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_314svbfloat16x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-CHECK-NEXT: ret [[TMP4]] -// -svbfloat16_t test_svget4_bf16_3(svbfloat16x4_t tuple) ATTR -{ - return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 3); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4.c index d2661d3a1d54e..3968d49799181 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_get4.c @@ -300,3 +300,95 @@ svmfloat8_t test_svget4_mf8(svmfloat8x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_mf8,,)(tuple, 0); } + +// CHECK-LABEL: @test_svget4_bf16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { 
, , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_014svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 0); +} + +// CHECK-LABEL: @test_svget4_bf16_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_114svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = 
extractvalue { , , , } [[TMP3]], 1 +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 1); +} + +// CHECK-LABEL: @test_svget4_bf16_2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_214svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 2); +} + +// CHECK-LABEL: @test_svget4_bf16_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svget4_bf16_314svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16_t test_svget4_bf16_3(svbfloat16x4_t tuple) ATTR +{ + return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 3); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr-bfloat.c deleted file mode 100644 index f65da9b6140de..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest 
-triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svinsr_n_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.insr.nxv8bf16( [[OP1:%.*]], bfloat [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z18test_svinsr_n_bf16u14__SVBfloat16_tu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.insr.nxv8bf16( [[OP1:%.*]], bfloat [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svinsr_n_bf16(svbfloat16_t op1, bfloat16_t op2) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svinsr_n_bf16'}} - return SVE_ACLE_FUNC(svinsr, _n_bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr.c index fae3220a02f95..cb2dd26722c6e 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_insr.c @@ -186,3 +186,18 @@ svfloat64_t test_svinsr_n_f64(svfloat64_t op1, float64_t op2) MODE_ATTR { return SVE_ACLE_FUNC(svinsr,_n_f64,,)(op1, op2); } + +// CHECK-LABEL: @test_svinsr_n_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.insr.nxv8bf16( [[OP1:%.*]], bfloat [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: 
@_Z18test_svinsr_n_bf16u14__SVBfloat16_tu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.insr.nxv8bf16( [[OP1:%.*]], bfloat [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svinsr_n_bf16(svbfloat16_t op1, bfloat16_t op2) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svinsr_n_bf16'}} + return SVE_ACLE_FUNC(svinsr, _n_bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta-bfloat.c deleted file mode 100644 index 3ef2a3fc9397c..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta-bfloat.c +++ /dev/null @@ -1,41 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: 
%clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svlasta_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lasta.nxv8bf16( [[TMP0]], [[OP:%.*]]) -// CHECK-NEXT: ret bfloat [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svlasta_bf16u10__SVBool_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lasta.nxv8bf16( [[TMP0]], [[OP:%.*]]) -// CPP-CHECK-NEXT: ret bfloat [[TMP1]] -// -bfloat16_t test_svlasta_bf16(svbool_t pg, svbfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svlasta_bf16'}} - return SVE_ACLE_FUNC(svlasta, _bf16, , )(pg, op); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta.c index 5db5138981dce..afd49f558c406 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lasta.c @@ -204,3 +204,20 @@ float64_t test_svlasta_f64(svbool_t pg, svfloat64_t op) MODE_ATTR { return SVE_ACLE_FUNC(svlasta,_f64,,)(pg, op); } + +// CHECK-LABEL: @test_svlasta_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = 
tail call bfloat @llvm.aarch64.sve.lasta.nxv8bf16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret bfloat [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svlasta_bf16u10__SVBool_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lasta.nxv8bf16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret bfloat [[TMP1]] +// +bfloat16_t test_svlasta_bf16(svbool_t pg, svbfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svlasta_bf16'}} + return SVE_ACLE_FUNC(svlasta, _bf16, , )(pg, op); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb-bfloat.c deleted file mode 100644 index d2caab2bd5dfe..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb-bfloat.c +++ /dev/null @@ -1,41 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svlastb_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lastb.nxv8bf16( [[TMP0]], [[OP:%.*]]) -// CHECK-NEXT: ret bfloat [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svlastb_bf16u10__SVBool_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lastb.nxv8bf16( [[TMP0]], [[OP:%.*]]) -// CPP-CHECK-NEXT: ret bfloat [[TMP1]] -// -bfloat16_t test_svlastb_bf16(svbool_t pg, svbfloat16_t op) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svlastb_bf16'}} - return SVE_ACLE_FUNC(svlastb, _bf16, , )(pg, op); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb.c index ce87ff77c2c83..7d2908af84675 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_lastb.c @@ -204,3 +204,20 @@ float64_t test_svlastb_f64(svbool_t pg, svfloat64_t op) MODE_ATTR { 
return SVE_ACLE_FUNC(svlastb,_f64,,)(pg, op); } + +// CHECK-LABEL: @test_svlastb_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lastb.nxv8bf16( [[TMP0]], [[OP:%.*]]) +// CHECK-NEXT: ret bfloat [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svlastb_bf16u10__SVBool_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call bfloat @llvm.aarch64.sve.lastb.nxv8bf16( [[TMP0]], [[OP:%.*]]) +// CPP-CHECK-NEXT: ret bfloat [[TMP1]] +// +bfloat16_t test_svlastb_bf16(svbool_t pg, svbfloat16_t op) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svlastb_bf16'}} + return SVE_ACLE_FUNC(svlastb, _bf16, , )(pg, op); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1-bfloat.c deleted file mode 100644 index aaf4e652cd145..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1-bfloat.c +++ /dev/null @@ -1,65 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: 
%clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svld1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z15test_svld1_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld1,_bf16,,)(pg, base); -} - -// CHECK-LABEL: @test_svld1_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() -// CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 
-// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) -// CHECK-NEXT: ret [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z20test_svld1_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() -// CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] -// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) -// CPP-CHECK-NEXT: ret [[TMP4]] -// -svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld1_vnum,_bf16,,)(pg, base, vnum); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c index 40dcd65f6c609..19e5243c8a625 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c @@ -1201,3 +1201,45 @@ svfloat64_t test_svld1_gather_u64base_index_f64(svbool_t pg, svuint64_t bases, i } #endif + +// CHECK-LABEL: @test_svld1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svld1_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld1,_bf16,,)(pg, base); +} + +// CHECK-LABEL: @test_svld1_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z20test_svld1_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld1_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro-bfloat.c deleted file mode 100644 index 5107877ae361c..0000000000000 --- 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro-bfloat.c +++ /dev/null @@ -1,32 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -target-feature +f64mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svld1ro_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1ro.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svld1ro_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1ro.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svld1ro_bf16(svbool_t pg, const bfloat16_t *base) { - return SVE_ACLE_FUNC(svld1ro, _bf16, , )(pg, base); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro.c index e7520a504b121..723135bd1a124 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ro.c @@ -201,3 +201,19 @@ svfloat32_t test_svld1ro_f32(svbool_t pg, const float32_t *base) { svfloat64_t test_svld1ro_f64(svbool_t pg, const float64_t *base) { return SVE_ACLE_FUNC(svld1ro, _f64, , )(pg, base); } + +// CHECK-LABEL: @test_svld1ro_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1ro.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1ro_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1ro.nxv8bf16( [[TMP0]], ptr 
[[BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svld1ro_bf16(svbool_t pg, const bfloat16_t *base) { + return SVE_ACLE_FUNC(svld1ro, _bf16, , )(pg, base); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq-bfloat.c deleted file mode 100644 index d50b0269e5297..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq-bfloat.c +++ /dev/null @@ -1,40 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svld1rq_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1rq.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svld1rq_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1rq.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svld1rq_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld1rq,_bf16,,)(pg, base); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq.c index 0e7455d413274..44dbb9696f154 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1rq.c @@ -220,3 +220,20 @@ svfloat64_t test_svld1rq_f64(svbool_t pg, const float64_t *base) MODE_ATTR { return SVE_ACLE_FUNC(svld1rq,_f64,,)(pg, base); } + +// CHECK-LABEL: @test_svld1rq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1rq.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svld1rq_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ld1rq.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// 
+svbfloat16_t test_svld1rq_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld1rq,_bf16,,)(pg, base); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2-bfloat.c deleted file mode 100644 index 5535b3d090d32..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2-bfloat.c +++ /dev/null @@ -1,60 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svld2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret { , } [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z15test_svld2_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret { , } [[TMP1]] -// -svbfloat16x2_t test_svld2_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld2,_bf16,,)(pg, base); -} - - -// CHECK-LABEL: @test_svld2_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret { , } [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z20test_svld2_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret { , } [[TMP2]] -// -svbfloat16x2_t test_svld2_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld2_vnum,_bf16,,)(pg, base, vnum); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2.c 
b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2.c index abe1c87b6f2c3..f785622af056c 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld2.c @@ -4,8 +4,8 @@ // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -442,3 +442,40 @@ svmfloat8x2_t test_svld2_vnum_mf8(svbool_t pg, const mfloat8_t *base, int64_t vn { return SVE_ACLE_FUNC(svld2_vnum,_mf8,,)(pg, base, vnum); } + +// CHECK-LABEL: @test_svld2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret { , } [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svld2_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: 
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret { , } [[TMP1]] +// +svbfloat16x2_t test_svld2_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld2,_bf16,,)(pg, base); +} + + +// CHECK-LABEL: @test_svld2_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret { , } [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svld2_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ld2.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret { , } [[TMP2]] +// +svbfloat16x2_t test_svld2_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld2_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3-bfloat.c deleted file mode 100644 index 41a367b737b0f..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3-bfloat.c +++ /dev/null @@ -1,60 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | 
FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svld3_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret { , , } [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z15test_svld3_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret { , , } [[TMP1]] -// -svbfloat16x3_t test_svld3_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld3,_bf16,,)(pg, base); -} - -// CHECK-LABEL: @test_svld3_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret { , , } [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z20test_svld3_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret { , , } [[TMP2]] -// -svbfloat16x3_t test_svld3_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld3_vnum,_bf16,,)(pg, base, vnum); -} diff --git 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3.c index 5ff7ad9de483b..2197775030766 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld3.c @@ -4,8 +4,8 @@ // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -441,3 +441,39 @@ svmfloat8x3_t test_svld3_vnum_mf8(svbool_t pg, const mfloat8_t *base, int64_t vn { return SVE_ACLE_FUNC(svld3_vnum,_mf8,,)(pg, base, vnum); } + +// CHECK-LABEL: @test_svld3_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret { , , } [[TMP1]] +// +// CPP-CHECK-LABEL: 
@_Z15test_svld3_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret { , , } [[TMP1]] +// +svbfloat16x3_t test_svld3_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld3,_bf16,,)(pg, base); +} + +// CHECK-LABEL: @test_svld3_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret { , , } [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svld3_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret { , , } [[TMP2]] +// +svbfloat16x3_t test_svld3_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld3_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4-bfloat.c deleted file mode 100644 index a88e6f11a0510..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4-bfloat.c +++ /dev/null @@ -1,59 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svld4_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret { , , , } [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z15test_svld4_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret { , , , } [[TMP1]] -// -svbfloat16x4_t test_svld4_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld4,_bf16,,)(pg, base); -} - -// CHECK-LABEL: @test_svld4_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret { , , , } [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z20test_svld4_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret { , , , } [[TMP2]] -// -svbfloat16x4_t test_svld4_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR -{ - return SVE_ACLE_FUNC(svld4_vnum,_bf16,,)(pg, base, vnum); -} diff --git 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4.c index 650fd5986be27..cd79dcee42bdb 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld4.c @@ -4,8 +4,8 @@ // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -441,3 +441,39 @@ svmfloat8x4_t test_svld4_vnum_mf8(svbool_t pg, const mfloat8_t *base, int64_t vn { return SVE_ACLE_FUNC(svld4_vnum,_mf8,,)(pg, base, vnum); } + +// CHECK-LABEL: @test_svld4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret { , , , } [[TMP1]] +// +// CPP-CHECK-LABEL: 
@_Z15test_svld4_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP1]] +// +svbfloat16x4_t test_svld4_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld4,_bf16,,)(pg, base); +} + +// CHECK-LABEL: @test_svld4_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret { , , , } [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z20test_svld4_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4.sret.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP2]] +// +svbfloat16x4_t test_svld4_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR +{ + return SVE_ACLE_FUNC(svld4_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1-bfloat.c deleted file mode 100644 index dce5839ebd759..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1-bfloat.c +++ /dev/null @@ -1,52 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svldff1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svldff1_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svldff1_bf16(svbool_t pg, const bfloat16_t *base) -{ - return SVE_ACLE_FUNC(svldff1,_bf16,,)(pg, base); -} - -// CHECK-LABEL: @test_svldff1_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , 
ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z22test_svldff1_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret [[TMP2]] -// -svbfloat16_t test_svldff1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) -{ - return SVE_ACLE_FUNC(svldff1_vnum,_bf16,,)(pg, base, vnum); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1.c index ba4091660bfae..461c620b21690 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldff1.c @@ -1117,3 +1117,39 @@ svfloat32_t test_svldff1_gather_u32base_index_f32(svbool_t pg, svuint32_t bases, svfloat64_t test_svldff1_gather_u64base_index_f64(svbool_t pg, svuint64_t bases, int64_t index) { return SVE_ACLE_FUNC(svldff1_gather, _u64base, _index_f64, )(pg, bases, index); } + +// CHECK-LABEL: @test_svldff1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svldff1_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t 
test_svldff1_bf16(svbool_t pg, const bfloat16_t *base) +{ + return SVE_ACLE_FUNC(svldff1,_bf16,,)(pg, base); +} + +// CHECK-LABEL: @test_svldff1_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z22test_svldff1_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldff1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svldff1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldff1_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1-bfloat.c deleted file mode 100644 index a8ebc5d63d894..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1-bfloat.c +++ /dev/null @@ -1,51 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror 
-Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svldnf1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svldnf1_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svldnf1_bf16(svbool_t pg, const bfloat16_t *base) -{ - return SVE_ACLE_FUNC(svldnf1,_bf16,,)(pg, base); -} - -// CHECK-LABEL: @test_svldnf1_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z22test_svldnf1_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , 
ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret [[TMP2]] -// -svbfloat16_t test_svldnf1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) -{ - return SVE_ACLE_FUNC(svldnf1_vnum,_bf16,,)(pg, base, vnum); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1.c index 8e738d839cd85..efdbe356f7c66 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnf1.c @@ -433,3 +433,39 @@ svfloat64_t test_svldnf1_vnum_f64(svbool_t pg, const float64_t *base, int64_t vn { return SVE_ACLE_FUNC(svldnf1_vnum,_f64,,)(pg, base, vnum); } + +// CHECK-LABEL: @test_svldnf1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svldnf1_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svldnf1_bf16(svbool_t pg, const bfloat16_t *base) +{ + return SVE_ACLE_FUNC(svldnf1,_bf16,,)(pg, base); +} + +// CHECK-LABEL: @test_svldnf1_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: 
@_Z22test_svldnf1_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnf1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svldnf1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) +{ + return SVE_ACLE_FUNC(svldnf1_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1-bfloat.c deleted file mode 100644 index 82d5bff2516d8..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1-bfloat.c +++ /dev/null @@ -1,60 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s 
-// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svldnt1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svldnt1_bf16u10__SVBool_tPKu6__bf16( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svldnt1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR -{ - return SVE_ACLE_FUNC(svldnt1,_bf16,,)(pg, base); -} - -// CHECK-LABEL: @test_svldnt1_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z22test_svldnt1_vnum_bf16u10__SVBool_tPKu6__bf16l( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret [[TMP2]] -// -svbfloat16_t test_svldnt1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR -{ - return SVE_ACLE_FUNC(svldnt1_vnum,_bf16,,)(pg, base, vnum); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1.c index b96bf0cb23d12..19919bf1d4af3 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ldnt1.c @@ -442,3 +442,39 @@ svmfloat8_t test_svldnt1_vnum_mf8(svbool_t pg, const mfloat8_t *base, int64_t vn { return SVE_ACLE_FUNC(svldnt1_vnum,_mf8,,)(pg, base, vnum); } + +// CHECK-LABEL: @test_svldnt1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z17test_svldnt1_bf16u10__SVBool_tPKu6__bf16( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svldnt1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR +{ + return SVE_ACLE_FUNC(svldnt1,_bf16,,)(pg, base); +} + +// CHECK-LABEL: @test_svldnt1_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z22test_svldnt1_vnum_bf16u10__SVBool_tPKu6__bf16l( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.ldnt1.nxv8bf16( [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: ret [[TMP2]] +// +svbfloat16_t test_svldnt1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR +{ + return SVE_ACLE_FUNC(svldnt1_vnum,_bf16,,)(pg, base, vnum); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len-bfloat.c deleted file mode 100644 index 049207514bc1d..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len-bfloat.c +++ /dev/null @@ -1,41 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve 
-target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svlen_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() -// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -// CHECK-NEXT: ret i64 [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z15test_svlen_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() -// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -// CPP-CHECK-NEXT: ret i64 [[TMP1]] -// -uint64_t test_svlen_bf16(svbfloat16_t op) MODE_ATTR -{ - // expected-warning@+1 {{implicit declaration of function 'svlen_bf16'}} - return SVE_ACLE_FUNC(svlen,_bf16,,)(op); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len.c index cca939296455e..5954e730864c4 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_len.c @@ -208,3 +208,21 @@ uint64_t test_svlen_f64(svfloat64_t op) MODE_ATTR { return SVE_ACLE_FUNC(svlen,_f64,,)(op); } + +// CHECK-LABEL: @test_svlen_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +// CHECK-NEXT: ret i64 [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svlen_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
i64 @llvm.vscale.i64() +// CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +// CPP-CHECK-NEXT: ret i64 [[TMP1]] +// +uint64_t test_svlen_bf16(svbfloat16_t op) MODE_ATTR +{ + // expected-warning@+1 {{implicit declaration of function 'svlen_bf16'}} + return SVE_ACLE_FUNC(svlen,_bf16,,)(op); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret-bfloat.c deleted file mode 100644 index 02704229292b2..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret-bfloat.c +++ /dev/null @@ -1,2562 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3 -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4 - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - 
-#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef TUPLE -#define TYPE_1(base,tuple) base ## tuple ## _t -#define TYPE_0(base,tuple) TYPE_1(base,tuple) -#define TYPE(base) TYPE_0(base,TUPLE) -#else -#define TYPE(base) base ## _t -#endif - -#ifdef SVE_OVERLOADED_FORMS -#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 -#else -#ifdef TUPLE -#define SVE_ACLE_FUNC_1(A1,A2,T) A1##A2##_##T -#define SVE_ACLE_FUNC_0(A1,A2,T) SVE_ACLE_FUNC_1(A1,A2,T) -#define SVE_ACLE_FUNC(A1,A2) SVE_ACLE_FUNC_0(A1,A2,TUPLE) -#else -#define SVE_ACLE_FUNC(A1,A2) A1##A2 -#endif -#endif - -// CHECK-LABEL: @test_svreinterpret_s8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_s8_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_s8_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 
1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_s8_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z26test_svreinterpret_s8_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// 
CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 
-// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_s8, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_s16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_s16_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_s16_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: 
[[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_s16_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , 
} [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_s16_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: 
@_Z27test_svreinterpret_s16_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_s16, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_s32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_s32_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = 
insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_s32_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_s32_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], 
[[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_s32_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// 
CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_s32, _bf16)(op); -} -// CHECK-LABEL: @test_svreinterpret_s64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: 
ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_s64_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_s64_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_s64_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], 
[[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_s64_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , 
} [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// 
CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_s64, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_u8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_u8_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_u8_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] 
-// -// TUPLE4-LABEL: @test_svreinterpret_u8_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z26test_svreinterpret_u8_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: 
[[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = 
insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_u8, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_u16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_u16_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_u16_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = 
extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_u16_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_u16_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, 
[[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } 
[[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_u16, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_u32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_u32_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_u32_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, 
[[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_u32_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], 
[[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_u32_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// 
CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_u32, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_u64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_u64_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: 
[[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_u64_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_u64_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } 
[[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_u64_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } 
[[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_u64, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] 
to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_s8( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_s8( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_s8( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } 
[[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z26test_svreinterpret_bf16_s8u10__SVInt8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } 
[[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// 
CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _s8)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_s16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_s16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] 
-// -// TUPLE4-LABEL: @test_svreinterpret_bf16_s16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s16u11__SVInt16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: 
[[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = 
insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _s16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_s32( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_s32( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = 
extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_s32( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s32u11__SVInt32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, 
[[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], 
[[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _s32)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_s64( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_s64( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 
-// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_s64( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// 
TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s64u11__SVInt64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: 
@_Z27test_svreinterpret_bf16_s6411svint64x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _s64)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_u8( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = 
insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_u8( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_u8( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], 
[[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z26test_svreinterpret_bf16_u8u11__SVUint8_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// 
CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _u8)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret 
[[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_u16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_u16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_u16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], 
[[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u16u12__SVUint16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } 
[[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// 
CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _u16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_u32( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_u32( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } 
[[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_u32( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u32u12__SVUint32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// 
CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: 
[[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _u32)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_u64( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_u64( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: 
[[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_u64( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u64u12__SVUint64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , 
} poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } 
[[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _u64)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret [[OP:%.*]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 -// TUPLE2-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 -// TUPLE2-NEXT: ret { , } [[TMP5]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } 
[[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 -// TUPLE3-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP8]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 -// TUPLE4-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP11]] -// -// CPP-CHECK-LABEL: @_Z28test_svreinterpret_bf16_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: ret [[OP:%.*]] -// -// CPP-TUPLE2-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 
-// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP5]] -// -// CPP-TUPLE3-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP8]] -// -// CPP-TUPLE4-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } 
[[TMP7]], [[TMP8]], 2 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP11]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_f16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_f16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_f16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// 
TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_f16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f16u13__SVFloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , 
} poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , 
, } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _f16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_f32( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_bf16_f32( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast 
[[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_f32( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f32u13__SVFloat32_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: 
@_Z27test_svreinterpret_bf16_f3213svfloat32x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// 
CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _f32)(op); -} - -// CHECK-LABEL: @test_svreinterpret_bf16_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_bf16_f64( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] 
-// -// TUPLE3-LABEL: @test_svreinterpret_bf16_f64( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_bf16_f64( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , 
} [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f64u13__SVFloat64_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// 
CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_bf16, _f64)(op); -} - -// CHECK-LABEL: @test_svreinterpret_f32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_f32_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// 
TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_f32_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_f32_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue 
{ , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_f32_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// 
CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_f32, 
_bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_f16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_f16_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_f16_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_f16_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 
-// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_f16_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x3_t( -// 
CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , 
} [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_f16, _bf16)(op); -} - -// CHECK-LABEL: @test_svreinterpret_f64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CHECK-NEXT: ret [[TMP0]] -// -// TUPLE2-LABEL: @test_svreinterpret_f64_bf16( -// TUPLE2-NEXT: entry: -// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// TUPLE2-NEXT: ret { , } [[TMP7]] -// -// TUPLE3-LABEL: @test_svreinterpret_f64_bf16( -// TUPLE3-NEXT: entry: -// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// 
TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// TUPLE4-LABEL: @test_svreinterpret_f64_bf16( -// TUPLE4-NEXT: entry: -// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 -// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -// CPP-CHECK-LABEL: @_Z27test_svreinterpret_f64_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to -// CPP-CHECK-NEXT: ret [[TMP0]] -// -// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x2_t( -// CPP-TUPLE2-NEXT: entry: -// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] 
to -// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 -// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to -// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 -// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] -// -// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x3_t( -// CPP-TUPLE3-NEXT: entry: -// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to -// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 -// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to -// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 -// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to -// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 -// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] -// -// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x4_t( -// CPP-TUPLE4-NEXT: entry: -// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 -// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 -// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 -// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 -// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to -// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 
0 -// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to -// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 -// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to -// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 -// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to -// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 -// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] -// -TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) MODE_ATTR { - return SVE_ACLE_FUNC(svreinterpret_f64, _bf16)(op); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret.c index 7c21c297b6a3d..c8d2d03588e1a 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_reinterpret.c @@ -13101,3 +13101,2512 @@ TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op) MODE_ATTR { return SVE_ACLE_FUNC(svreinterpret_f64,_f64)(op); } + +// CHECK-LABEL: @test_svreinterpret_s8_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_s8_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// 
TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_s8_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_s8_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] 
= insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z26test_svreinterpret_s8_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = 
extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_s8, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_s16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_s16_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 
+// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_s16_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_s16_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = 
insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_s16_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { 
, , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svint16) 
test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_s16, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_s32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_s32_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_s32_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_s32_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, 
[[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_s32_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { 
, } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: 
[[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_s32, _bf16)(op); +} +// CHECK-LABEL: @test_svreinterpret_s64_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_s64_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_s64_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } 
[[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_s64_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_s64_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: 
[[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: 
[[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_s64, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_u8_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_u8_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_u8_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = 
insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_u8_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z26test_svreinterpret_u8_bf16u14__SVBfloat16_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , 
} poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_u8, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_u16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_u16_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast 
[[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_u16_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_u16_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// 
TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_u16_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// 
CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_u16, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_u32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_u32_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = 
insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_u32_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_u32_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = 
bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_u32_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 
+// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] 
+// +TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_u32, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_u64_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_u64_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_u64_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_u64_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { 
, , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_u64_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// 
CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 
2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_u64, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_s8( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_s8( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = 
insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_s8( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z26test_svreinterpret_bf16_s8u10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: 
[[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = 
bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _s8)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_s16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_s16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , 
, } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_s16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s16u11__SVInt16_t( +// CPP-CHECK-NEXT: entry: 
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 
0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _s16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_s32( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: 
[[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_s32( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_s32( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = 
insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s32u11__SVInt32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue 
{ , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _s32)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_s64( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// 
TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_s64( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_s64( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = 
insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s64u11__SVInt64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } 
[[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) 
test_svreinterpret_bf16_s64(TYPE(svint64) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _s64)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_u8( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_u8( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_u8( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, 
[[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z26test_svreinterpret_bf16_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } 
[[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: 
[[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _u8)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_u16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_u16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } 
[[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_u16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u16u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] 
= extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast 
[[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _u16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_u32( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_u32( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } 
[[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_u32( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u32u12__SVUint32_t( +// CPP-CHECK-NEXT: entry: +// 
CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 
0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _u32)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_u64( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// 
TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_u64( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_u64( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: 
[[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u64u12__SVUint64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: 
[[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _u64)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret [[OP:%.*]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] 
= insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// TUPLE2-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// TUPLE2-NEXT: ret { , } [[TMP5]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// TUPLE3-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP8]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// TUPLE4-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// TUPLE4-NEXT: 
[[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP11]] +// +// CPP-CHECK-LABEL: @_Z28test_svreinterpret_bf16_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret [[OP:%.*]] +// +// CPP-TUPLE2-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP5]] +// +// CPP-TUPLE3-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP8]] +// +// CPP-TUPLE4-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { 
, , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP11]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_f16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_f16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] 
= insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_f16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] 
+// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f16u13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x4_t( +// 
CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _f16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_bf16_f32( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: 
[[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_f32( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_f32( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { 
, , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f32u13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// 
CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _f32)(op); +} + +// CHECK-LABEL: @test_svreinterpret_bf16_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: 
@test_svreinterpret_bf16_f64( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_bf16_f64( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_bf16_f64( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: 
[[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f64u13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// 
CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = 
insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_bf16, _f64)(op); +} + +// CHECK-LABEL: @test_svreinterpret_f32_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_f32_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_f32_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: 
@test_svreinterpret_f32_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_f32_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = 
bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { 
, , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_f32, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_f16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_f16_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_f16_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue 
{ , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_f16_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_f16_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, 
[[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } 
[[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_f16, _bf16)(op); +} + +// CHECK-LABEL: @test_svreinterpret_f64_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CHECK-NEXT: ret [[TMP0]] +// +// TUPLE2-LABEL: @test_svreinterpret_f64_bf16( +// TUPLE2-NEXT: entry: +// TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// TUPLE2-NEXT: ret { , } [[TMP7]] +// +// TUPLE3-LABEL: @test_svreinterpret_f64_bf16( +// TUPLE3-NEXT: entry: +// TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, 
[[OP_COERCE0:%.*]], 0 +// TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// TUPLE4-LABEL: @test_svreinterpret_f64_bf16( +// TUPLE4-NEXT: entry: +// TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], 
[[TMP14]], 3 +// TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +// CPP-CHECK-LABEL: @_Z27test_svreinterpret_f64_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast [[OP:%.*]] to +// CPP-CHECK-NEXT: ret [[TMP0]] +// +// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x2_t( +// CPP-TUPLE2-NEXT: entry: +// CPP-TUPLE2-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE2-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE2-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-TUPLE2-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CPP-TUPLE2-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CPP-TUPLE2-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-TUPLE2-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CPP-TUPLE2-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CPP-TUPLE2-NEXT: ret { , } [[TMP7]] +// +// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x3_t( +// CPP-TUPLE3-NEXT: entry: +// CPP-TUPLE3-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE3-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE3-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE3-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-TUPLE3-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CPP-TUPLE3-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CPP-TUPLE3-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-TUPLE3-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CPP-TUPLE3-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CPP-TUPLE3-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-TUPLE3-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CPP-TUPLE3-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CPP-TUPLE3-NEXT: ret { , , } [[TMP11]] +// +// 
CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x4_t( +// CPP-TUPLE4-NEXT: entry: +// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0:%.*]], 0 +// CPP-TUPLE4-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1:%.*]], 1 +// CPP-TUPLE4-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2:%.*]], 2 +// CPP-TUPLE4-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3:%.*]], 3 +// CPP-TUPLE4-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-TUPLE4-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CPP-TUPLE4-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CPP-TUPLE4-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-TUPLE4-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CPP-TUPLE4-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CPP-TUPLE4-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-TUPLE4-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CPP-TUPLE4-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CPP-TUPLE4-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-TUPLE4-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CPP-TUPLE4-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CPP-TUPLE4-NEXT: ret { , , , } [[TMP15]] +// +TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) MODE_ATTR { + return SVE_ACLE_FUNC(svreinterpret_f64, _bf16)(op); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev-bfloat.c deleted file mode 100644 index 9b3e813fa9694..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature 
+bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svrev_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.rev.nxv8bf16( [[OP:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z15test_svrev_bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.rev.nxv8bf16( [[OP:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svrev_bf16(svbfloat16_t op) MODE_ATTR -{ - return SVE_ACLE_FUNC(svrev,_bf16,,)(op); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev.c index 3c0ae7df79644..839eee402d4b8 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_rev.c @@ -246,3 +246,18 @@ svbool_t test_svrev_b64(svbool_t op) MODE_ATTR { return svrev_b64(op); } + +// CHECK-LABEL: @test_svrev_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.rev.nxv8bf16( [[OP:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svrev_bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.rev.nxv8bf16( [[OP:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svrev_bf16(svbfloat16_t op) MODE_ATTR +{ + return SVE_ACLE_FUNC(svrev,_bf16,,)(op); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel-bfloat.c deleted file mode 100644 index 82be1904f6770..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel-bfloat.c +++ /dev/null @@ -1,41 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 
-target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svsel_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], [[OP2:%.*]] -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z15test_svsel_bf16u10__SVBool_tu14__SVBfloat16_tS0_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], [[OP2:%.*]] -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svsel_bf16(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - // expected-warning@+1 {{implicit declaration of function 'svsel_bf16'}} - return SVE_ACLE_FUNC(svsel,_bf16,,)(pg, op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel.c index 9cf7f4d7f45cc..62c63d0c18b8a 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_sel.c @@ -219,3 +219,21 @@ svbool_t test_svsel_b(svbool_t pg, svbool_t op1, svbool_t op2) MODE_ATTR { return SVE_ACLE_FUNC(svsel,_b,,)(pg, op1, op2); } + +// CHECK-LABEL: @test_svsel_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], [[OP2:%.*]] +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z15test_svsel_bf16u10__SVBool_tu14__SVBfloat16_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = select [[TMP0]], [[OP1:%.*]], [[OP2:%.*]] +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsel_bf16(svbool_t 
pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + // expected-warning@+1 {{implicit declaration of function 'svsel_bf16'}} + return SVE_ACLE_FUNC(svsel,_bf16,,)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2-bfloat.c deleted file mode 100644 index edd30278a9714..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2-bfloat.c +++ /dev/null @@ -1,61 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... 
macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svset2_bf16_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 0 -// CHECK-NEXT: ret { , } [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z18test_svset2_bf16_014svbfloat16x2_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 0 -// CPP-CHECK-NEXT: ret { , } [[TMP2]] -// -svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 0, x); -} - -// CHECK-LABEL: @test_svset2_bf16_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 1 -// CHECK-NEXT: ret { , } [[TMP2]] -// -// CPP-CHECK-LABEL: @_Z18test_svset2_bf16_114svbfloat16x2_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 1 -// CPP-CHECK-NEXT: ret { , } [[TMP2]] -// -svbfloat16x2_t test_svset2_bf16_1(svbfloat16x2_t tuple, svbfloat16_t x) ATTR -{ - return 
SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 1, x); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2.c index bdf57b42c8d8b..661591aded044 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set2.c @@ -249,3 +249,41 @@ svmfloat8x2_t test_svset2_mf8(svmfloat8x2_t tuple, svmfloat8_t x) ATTR { return SVE_ACLE_FUNC(svset2,_mf8,,)(tuple, 1, x); } + +// CHECK-LABEL: @test_svset2_bf16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 0 +// CHECK-NEXT: ret { , } [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z18test_svset2_bf16_014svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 0 +// CPP-CHECK-NEXT: ret { , } [[TMP2]] +// +svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 0, x); +} + +// CHECK-LABEL: @test_svset2_bf16_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 1 +// CHECK-NEXT: ret { , } [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z18test_svset2_bf16_114svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , } [[TMP1]], [[X:%.*]], 1 +// CPP-CHECK-NEXT: ret { , } [[TMP2]] +// +svbfloat16x2_t test_svset2_bf16_1(svbfloat16x2_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 1, x); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3-bfloat.c deleted file mode 100644 index 35cf63c44b6aa..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3-bfloat.c +++ /dev/null @@ -1,86 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef 
SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svset3_bf16_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 0 -// CHECK-NEXT: ret { , , } [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z18test_svset3_bf16_014svbfloat16x3_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 0 -// CPP-CHECK-NEXT: ret { , , } [[TMP3]] -// -svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 0, x); -} - -// CHECK-LABEL: @test_svset3_bf16_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 1 -// CHECK-NEXT: ret { , , } [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z18test_svset3_bf16_114svbfloat16x3_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, 
[[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 1 -// CPP-CHECK-NEXT: ret { , , } [[TMP3]] -// -svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 1, x); -} - -// CHECK-LABEL: @test_svset3_bf16_2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 2 -// CHECK-NEXT: ret { , , } [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z18test_svset3_bf16_214svbfloat16x3_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 2 -// CPP-CHECK-NEXT: ret { , , } [[TMP3]] -// -svbfloat16x3_t test_svset3_bf16_2(svbfloat16x3_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 2, x); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3.c index 34f7a9ae6a38c..5eebd63231b1a 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set3.c @@ -275,3 +275,66 @@ svmfloat8x3_t test_svset3_mf8(svmfloat8x3_t tuple, svmfloat8_t x) ATTR { return SVE_ACLE_FUNC(svset3,_mf8,,)(tuple, 1, x); } + +// CHECK-LABEL: 
@test_svset3_bf16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 0 +// CHECK-NEXT: ret { , , } [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z18test_svset3_bf16_014svbfloat16x3_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 0 +// CPP-CHECK-NEXT: ret { , , } [[TMP3]] +// +svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 0, x); +} + +// CHECK-LABEL: @test_svset3_bf16_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 1 +// CHECK-NEXT: ret { , , } [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z18test_svset3_bf16_114svbfloat16x3_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 1 +// CPP-CHECK-NEXT: ret { , , } [[TMP3]] +// +svbfloat16x3_t 
test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 1, x); +} + +// CHECK-LABEL: @test_svset3_bf16_2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 2 +// CHECK-NEXT: ret { , , } [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z18test_svset3_bf16_214svbfloat16x3_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , } [[TMP2]], [[X:%.*]], 2 +// CPP-CHECK-NEXT: ret { , , } [[TMP3]] +// +svbfloat16x3_t test_svset3_bf16_2(svbfloat16x3_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 2, x); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4-bfloat.c deleted file mode 100644 index 2f6035e6a88cc..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4-bfloat.c +++ /dev/null @@ -1,115 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt 
-S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#ifdef __ARM_FEATURE_SME -#define ATTR __arm_streaming -#else -#define ATTR -#endif - -// CHECK-LABEL: @test_svset4_bf16_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 0 -// CHECK-NEXT: ret { , , , } [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_014svbfloat16x4_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 0 -// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] -// -svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 0, x); -} - -// CHECK-LABEL: @test_svset4_bf16_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 1 -// CHECK-NEXT: ret { , , , } 
[[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_114svbfloat16x4_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 1 -// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] -// -svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 1, x); -} - -// CHECK-LABEL: @test_svset4_bf16_2( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 2 -// CHECK-NEXT: ret { , , , } [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_214svbfloat16x4_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 2 -// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] -// -svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) ATTR -{ - return 
SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 2, x); -} - -// CHECK-LABEL: @test_svset4_bf16_3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 3 -// CHECK-NEXT: ret { , , , } [[TMP4]] -// -// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_314svbfloat16x4_tu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 3 -// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] -// -svbfloat16x4_t test_svset4_bf16_3(svbfloat16x4_t tuple, svbfloat16_t x) ATTR -{ - return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 3, x); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4.c index 06df3c1daee67..fe54445010743 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_set4.c @@ -297,3 +297,95 @@ svmfloat8x4_t test_svset4_mf8(svmfloat8x4_t tuple, svmfloat8_t x) ATTR { return SVE_ACLE_FUNC(svset4,_mf8,,)(tuple, 1, x); } + +// CHECK-LABEL: @test_svset4_bf16_0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 0 +// CHECK-NEXT: ret { , , , } [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_014svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 0 +// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] +// +svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 0, x); +} + +// CHECK-LABEL: @test_svset4_bf16_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 1 +// CHECK-NEXT: ret { , , , } [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_114svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } 
[[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 1 +// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] +// +svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 1, x); +} + +// CHECK-LABEL: @test_svset4_bf16_2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 2 +// CHECK-NEXT: ret { , , , } [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_214svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 2 +// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] +// +svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 2, x); +} + +// CHECK-LABEL: @test_svset4_bf16_3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CHECK-NEXT: 
[[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 3 +// CHECK-NEXT: ret { , , , } [[TMP4]] +// +// CPP-CHECK-LABEL: @_Z18test_svset4_bf16_314svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[TUPLE_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[TUPLE_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[TUPLE_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[TUPLE_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , , } [[TMP3]], [[X:%.*]], 3 +// CPP-CHECK-NEXT: ret { , , , } [[TMP4]] +// +svbfloat16x4_t test_svset4_bf16_3(svbfloat16x4_t tuple, svbfloat16_t x) ATTR +{ + return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 3, x); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice-bfloat.c deleted file mode 100644 index 266e32fb59dd3..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice-bfloat.c +++ /dev/null @@ -1,41 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 
-fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svsplice_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.splice.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z18test_svsplice_bf16u10__SVBool_tu14__SVBfloat16_tS0_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.splice.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbfloat16_t test_svsplice_bf16(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - // expected-warning@+1 {{implicit declaration of function 'svsplice_bf16'}} - return SVE_ACLE_FUNC(svsplice,_bf16,,)(pg, op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice.c index fe4e910e37aae..58d194507a175 100644 --- 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_splice.c @@ -204,3 +204,21 @@ svfloat64_t test_svsplice_f64(svbool_t pg, svfloat64_t op1, svfloat64_t op2) MOD { return SVE_ACLE_FUNC(svsplice,_f64,,)(pg, op1, op2); } + +// CHECK-LABEL: @test_svsplice_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.splice.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z18test_svsplice_bf16u10__SVBool_tu14__SVBfloat16_tS0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.splice.nxv8bf16( [[TMP0]], [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbfloat16_t test_svsplice_bf16(svbool_t pg, svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + // expected-warning@+1 {{implicit declaration of function 'svsplice_bf16'}} + return SVE_ACLE_FUNC(svsplice,_bf16,,)(pg, op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1-bfloat.c deleted file mode 100644 index 1d194626418a2..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1-bfloat.c +++ /dev/null @@ -1,66 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s 
| opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svst1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, [[TMP0]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z15test_svst1_bf16u10__SVBool_tPu6__bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, [[TMP0]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst1,_bf16,,)(pg, base, data); -} - -// CHECK-LABEL: @test_svst1_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() -// CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z20test_svst1_vnum_bf16u10__SVBool_tPu6__bf16lu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() -// CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] -// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] 
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst1_vnum,_bf16,,)(pg, base, vnum, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c index 21350007da86f..56f8c32c23099 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c @@ -1243,3 +1243,45 @@ void test_svst1_scatter_u64base_index_f64(svbool_t pg, svuint64_t bases, int64_t } #endif + +// CHECK-LABEL: @test_svst1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst1_bf16u10__SVBool_tPu6__bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst1,_bf16,,)(pg, base, data); +} + +// CHECK-LABEL: @test_svst1_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] +// CHECK-NEXT: tail call void 
@llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_vnum_bf16u10__SVBool_tPu6__bf16lu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] +// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst1_vnum,_bf16,,)(pg, base, vnum, data); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2-bfloat.c deleted file mode 100644 index 726aae2fa78a1..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2-bfloat.c +++ /dev/null @@ -1,75 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 
-DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif -// CHECK-LABEL: @test_svst2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst2,_bf16,,)(pg, base, data); -} - -// CHECK-LABEL: @test_svst2_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst2_vnum,_bf16,,)(pg, base, vnum, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2.c 
b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2.c index 9e73e4464c6f9..cf6d1be579472 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st2.c @@ -633,3 +633,55 @@ void test_svst2_vnum_mf8(svbool_t pg, mfloat8_t *base, int64_t vnum, svmfloat8x2 { return SVE_ACLE_FUNC(svst2_vnum,_mf8,,)(pg, base, vnum, data); } + +// CHECK-LABEL: @test_svst2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst2,_bf16,,)(pg, base, data); +} + +// CHECK-LABEL: @test_svst2_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], 
[[DATA_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[DATA_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP1]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst2_vnum,_bf16,,)(pg, base, vnum, data); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3-bfloat.c deleted file mode 100644 index 2a71029a8f573..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3-bfloat.c +++ /dev/null @@ -1,84 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s 
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svst3_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst3,_bf16,,)(pg, base, data); -} - -// CHECK-LABEL: @test_svst3_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, 
[[DATA_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP7:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[TMP7]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[TMP7]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst3_vnum,_bf16,,)(pg, base, vnum, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3.c index b693b693b1ebb..6241e709854c9 100644 --- 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st3.c @@ -729,3 +729,63 @@ void test_svst3_vnum_mf8(svbool_t pg, mfloat8_t *base, int64_t vnum, svmfloat8x3 { return SVE_ACLE_FUNC(svst3_vnum,_mf8,,)(pg, base, vnum, data); } + +// CHECK-LABEL: @test_svst3_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) MODE_ATTR +{ + return 
SVE_ACLE_FUNC(svst3,_bf16,,)(pg, base, data); +} + +// CHECK-LABEL: @test_svst3_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[DATA_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], ptr [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst3_vnum,_bf16,,)(pg, base, vnum, data); +} diff --git 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4-bfloat.c deleted file mode 100644 index 1f4c4fde8ad1b..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4-bfloat.c +++ /dev/null @@ -1,92 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svst4_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], 
[[TMP6]], [[TMP7]], [[TMP8]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst4,_bf16,,)(pg, base, data); -} - -// CHECK-LABEL: @test_svst4_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 -// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 -// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP9:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[TMP9]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 -// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 -// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 -// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 -// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 -// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 -// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 -// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 -// 
CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP9:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[TMP9]]) -// CPP-CHECK-NEXT: ret void -// -void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svst4_vnum,_bf16,,)(pg, base, vnum, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4.c index f8c3b60682573..0536a8d265b4c 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st4.c @@ -825,3 +825,71 @@ void test_svst4_vnum_mf8(svbool_t pg, mfloat8_t *base, int64_t vnum, svmfloat8x4 { return SVE_ACLE_FUNC(svst4_vnum,_mf8,,)(pg, base, vnum, data); } + +// CHECK-LABEL: @test_svst4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( 
+// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst4,_bf16,,)(pg, base, data); +} + +// CHECK-LABEL: @test_svst4_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[TMP9]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[DATA_COERCE0:%.*]], 0 +// CPP-CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[DATA_COERCE1:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[DATA_COERCE2:%.*]], 2 +// CPP-CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[DATA_COERCE3:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]], [[TMP8]], ptr [[TMP9]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svst4_vnum,_bf16,,)(pg, base, vnum, data); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1-bfloat.c deleted file mode 100644 index c13f7d82caac0..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1-bfloat.c +++ /dev/null @@ -1,60 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | 
opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svstnt1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[BASE:%.*]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z17test_svstnt1_bf16u10__SVBool_tPu6__bf16u14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[BASE:%.*]]) -// CPP-CHECK-NEXT: ret void -// -void test_svstnt1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svstnt1,_bf16,,)(pg, base, data); -} - -// CHECK-LABEL: @test_svstnt1_vnum_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[TMP1]]) -// CHECK-NEXT: ret void -// -// CPP-CHECK-LABEL: @_Z22test_svstnt1_vnum_bf16u10__SVBool_tPu6__bf16lu14__SVBfloat16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[TMP1]]) -// CPP-CHECK-NEXT: ret void -// -void test_svstnt1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) MODE_ATTR -{ - return SVE_ACLE_FUNC(svstnt1_vnum,_bf16,,)(pg, base, vnum, data); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1.c 
b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1.c index f739ea5dca641..4f6b5000e82f1 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_stnt1.c @@ -442,3 +442,39 @@ void test_svstnt1_vnum_mf8(svbool_t pg, mfloat8_t *base, int64_t vnum, svmfloat8 { return SVE_ACLE_FUNC(svstnt1_vnum,_mf8,,)(pg, base, vnum, data); } + +// CHECK-LABEL: @test_svstnt1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z17test_svstnt1_bf16u10__SVBool_tPu6__bf16u14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svstnt1,_bf16,,)(pg, base, data); +} + +// CHECK-LABEL: @test_svstnt1_vnum_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svstnt1_vnum_bf16u10__SVBool_tPu6__bf16lu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.nxv8bf16( [[DATA:%.*]], [[TMP0]], ptr [[TMP1]]) 
+// CPP-CHECK-NEXT: ret void +// +void test_svstnt1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) MODE_ATTR +{ + return SVE_ACLE_FUNC(svstnt1_vnum,_bf16,,)(pg, base, vnum, data); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl-bfloat.c deleted file mode 100644 index d4b6b6842fb9a..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define 
MODE_ATTR -#endif - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtbl_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[INDICES:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z15test_svtbl_bf16u14__SVBfloat16_tu12__SVUint16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[INDICES:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtbl_bf16(svbfloat16_t data, svuint16_t indices) MODE_ATTR { - // expected-warning@+1 {{implicit declaration of function 'svtbl_bf16'}} - return SVE_ACLE_FUNC(svtbl, _bf16, , )(data, indices); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl.c index 89fa47b5f7974..607a38933623a 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_tbl.c @@ -186,3 +186,18 @@ svfloat64_t test_svtbl_f64(svfloat64_t data, svuint64_t indices) MODE_ATTR { return SVE_ACLE_FUNC(svtbl,_f64,,)(data, indices); } + +// CHECK-LABEL: @test_svtbl_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[INDICES:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svtbl_bf16u14__SVBfloat16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl.nxv8bf16( [[DATA:%.*]], [[INDICES:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtbl_bf16(svbfloat16_t data, svuint16_t indices) MODE_ATTR { + // expected-warning@+1 {{implicit declaration of function 'svtbl_bf16'}} + return SVE_ACLE_FUNC(svtbl, _bf16, 
, )(data, indices); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-bfloat.c deleted file mode 100644 index a6c6dcc571e28..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtrn1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svtrn1_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtrn1_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - return SVE_ACLE_FUNC(svtrn1,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64-bfloat.c deleted file mode 100644 index ea1d515a9b4b2..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64-bfloat.c +++ /dev/null @@ -1,30 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtrn1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svtrn1_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtrn1_bf16(svbfloat16_t op1, svbfloat16_t op2) { - return SVE_ACLE_FUNC(svtrn1q, _bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64.c index 04028c7850ce3..ab1d2e6c52a77 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1-fp64.c @@ -168,3 +168,17 @@ svfloat32_t test_svtrn1_f32(svfloat32_t op1, svfloat32_t op2) { svfloat64_t test_svtrn1_f64(svfloat64_t op1, svfloat64_t op2) { return SVE_ACLE_FUNC(svtrn1q, _f64, , )(op1, op2); } + +// CHECK-LABEL: @test_svtrn1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svtrn1_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtrn1_bf16(svbfloat16_t op1, svbfloat16_t op2) { 
+ return SVE_ACLE_FUNC(svtrn1q, _bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1.c index f6d8ff770c600..bde209e94d230 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn1.c @@ -246,3 +246,18 @@ svbool_t test_svtrn1_b64(svbool_t op1, svbool_t op2) MODE_ATTR { return svtrn1_b64(op1, op2); } + +// CHECK-LABEL: @test_svtrn1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svtrn1_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtrn1_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + return SVE_ACLE_FUNC(svtrn1,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-bfloat.c deleted file mode 100644 index 87063ac69dfac..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 
-target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtrn2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svtrn2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtrn2_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - return SVE_ACLE_FUNC(svtrn2,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64-bfloat.c deleted file mode 100644 index d8171cc240550..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64-bfloat.c +++ /dev/null @@ -1,30 +0,0 @@ -// NOTE: Assertions have been 
autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtrn2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svtrn2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtrn2_bf16(svbfloat16_t op1, svbfloat16_t op2) { - return SVE_ACLE_FUNC(svtrn2q, _bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64.c index e9859192333f8..00878e423d92b 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2-fp64.c @@ -168,3 +168,17 @@ svfloat32_t test_svtrn2_f32(svfloat32_t op1, svfloat32_t op2) { svfloat64_t test_svtrn2_f64(svfloat64_t op1, svfloat64_t op2) { return SVE_ACLE_FUNC(svtrn2q, _f64, , )(op1, op2); } + +// CHECK-LABEL: @test_svtrn2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svtrn2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtrn2_bf16(svbfloat16_t op1, svbfloat16_t op2) { + return SVE_ACLE_FUNC(svtrn2q, _bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2.c index 9442142bc097f..838fc9bf53e2a 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2.c +++ 
b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_trn2.c @@ -246,3 +246,18 @@ svbool_t test_svtrn2_b64(svbool_t op1, svbool_t op2) MODE_ATTR { return svtrn2_b64(op1, op2); } + +// CHECK-LABEL: @test_svtrn2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svtrn2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.trn2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtrn2_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + return SVE_ACLE_FUNC(svtrn2,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef-bfloat.c deleted file mode 100644 index b15028c4b2629..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef-bfloat.c +++ /dev/null @@ -1,28 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - 
-// CHECK-LABEL: @test_svundef_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret undef -// -// CPP-CHECK-LABEL: @_Z17test_svundef_bf16v( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: ret undef -// -svbfloat16_t test_svundef_bf16(void) MODE_ATTR -{ - return svundef_bf16(); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef.c index 9b4caa7648926..0109c5f348602 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef.c @@ -168,3 +168,16 @@ svmfloat8_t test_svundef_mf8(void) MODE_ATTR { return svundef_mf8(); } + +// CHECK-LABEL: @test_svundef_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret undef +// +// CPP-CHECK-LABEL: @_Z17test_svundef_bf16v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret undef +// +svbfloat16_t test_svundef_bf16(void) MODE_ATTR +{ + return svundef_bf16(); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2-bfloat.c deleted file mode 100644 index e7325a25cd33b..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2-bfloat.c +++ /dev/null @@ -1,28 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -// CHECK-LABEL: @test_svundef2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret { , } undef -// -// CPP-CHECK-LABEL: @_Z18test_svundef2_bf16v( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: ret { , } undef -// -svbfloat16x2_t test_svundef2_bf16(void) MODE_ATTR -{ - return svundef2_bf16(); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2.c index 27e4caffa348d..24b0f32100970 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef2.c @@ -2,8 +2,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -169,3 +169,16 @@ svmfloat8x2_t test_svundef2_mf8(void) MODE_ATTR { return svundef2_mf8(); } + +// CHECK-LABEL: @test_svundef2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret { , } undef +// +// 
CPP-CHECK-LABEL: @_Z18test_svundef2_bf16v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret { , } undef +// +svbfloat16x2_t test_svundef2_bf16(void) MODE_ATTR +{ + return svundef2_bf16(); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3-bfloat.c deleted file mode 100644 index 7a35431daead1..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3-bfloat.c +++ /dev/null @@ -1,28 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -// CHECK-LABEL: @test_svundef3_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret { , , } undef -// -// CPP-CHECK-LABEL: @_Z18test_svundef3_bf16v( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: ret { , , } undef -// -svbfloat16x3_t test_svundef3_bf16(void) MODE_ATTR -{ - return svundef3_bf16(); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3.c index 767114fb7984d..26dc671043ac0 100644 --- 
a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef3.c @@ -2,8 +2,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -169,3 +169,16 @@ svmfloat8x3_t test_svundef3_mf8(void) MODE_ATTR { return svundef3_mf8(); } + +// CHECK-LABEL: @test_svundef3_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret { , , } undef +// +// CPP-CHECK-LABEL: @_Z18test_svundef3_bf16v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret { , , } undef +// +svbfloat16x3_t test_svundef3_bf16(void) MODE_ATTR +{ + return svundef3_bf16(); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4-bfloat.c deleted file mode 100644 index 431b82bb5f106..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4-bfloat.c +++ /dev/null @@ -1,28 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve 
-target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -// CHECK-LABEL: @test_svundef4_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret { , , , } undef -// -// CPP-CHECK-LABEL: @_Z18test_svundef4_bf16v( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: ret { , , , } undef -// -svbfloat16x4_t test_svundef4_bf16(void) MODE_ATTR -{ - return svundef4_bf16(); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4.c index fe0b8965ed0e7..fb25f20b0f5fd 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_undef4.c @@ -2,8 +2,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 
-target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include @@ -169,3 +169,16 @@ svmfloat8x4_t test_svundef4_mf8(void) MODE_ATTR { return svundef4_mf8(); } + +// CHECK-LABEL: @test_svundef4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret { , , , } undef +// +// CPP-CHECK-LABEL: @_Z18test_svundef4_bf16v( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: ret { , , , } undef +// +svbfloat16x4_t test_svundef4_bf16(void) MODE_ATTR +{ + return svundef4_bf16(); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-bfloat.c deleted file mode 100644 index 91863a400480a..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve 
-target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svuzp1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svuzp1_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svuzp1_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - return SVE_ACLE_FUNC(svuzp1,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c deleted file mode 100644 index 0ce3685db5071..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64-bfloat.c +++ /dev/null @@ -1,30 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim 
| FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svuzp1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svuzp1_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svuzp1_bf16(svbfloat16_t op1, svbfloat16_t op2) { - return SVE_ACLE_FUNC(svuzp1q, _bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64.c index c8e36d8ca2bb0..11b410da0dc8b 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1-fp64.c @@ -168,3 
+168,17 @@ svfloat32_t test_svuzp1_f32(svfloat32_t op1, svfloat32_t op2) { svfloat64_t test_svuzp1_f64(svfloat64_t op1, svfloat64_t op2) { return SVE_ACLE_FUNC(svuzp1q, _f64, , )(op1, op2); } + +// CHECK-LABEL: @test_svuzp1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp1_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svuzp1_bf16(svbfloat16_t op1, svbfloat16_t op2) { + return SVE_ACLE_FUNC(svuzp1q, _bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1.c index 3581c04db0858..90865a07bd307 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp1.c @@ -246,3 +246,18 @@ svbool_t test_svuzp1_b64(svbool_t op1, svbool_t op2) MODE_ATTR { return svuzp1_b64(op1, op2); } + +// CHECK-LABEL: @test_svuzp1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp1_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svuzp1_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + return SVE_ACLE_FUNC(svuzp1,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-bfloat.c deleted file mode 100644 index 83ba97e40d527..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-bfloat.c 
+++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svuzp2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svuzp2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svuzp2_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - return SVE_ACLE_FUNC(svuzp2,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c deleted file mode 100644 index 2cd16655fbb15..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64-bfloat.c +++ /dev/null @@ -1,30 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svuzp2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svuzp2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svuzp2_bf16(svbfloat16_t op1, svbfloat16_t op2) { - return SVE_ACLE_FUNC(svuzp2q, _bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64.c index c5ced8c80c982..087bb5e7c8173 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2-fp64.c @@ -168,3 +168,17 @@ svfloat32_t test_svuzp2_f32(svfloat32_t op1, svfloat32_t op2) { svfloat64_t test_svuzp2_f64(svfloat64_t op1, svfloat64_t op2) { return SVE_ACLE_FUNC(svuzp2q, _f64, , )(op1, op2); } + +// CHECK-LABEL: @test_svuzp2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svuzp2_bf16(svbfloat16_t op1, svbfloat16_t op2) { 
+ return SVE_ACLE_FUNC(svuzp2q, _bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2.c index 959afbd72e090..8e00703720557 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_uzp2.c @@ -246,3 +246,18 @@ svbool_t test_svuzp2_b64(svbool_t op1, svbool_t op2) MODE_ATTR { return svuzp2_b64(op1, op2); } + +// CHECK-LABEL: @test_svuzp2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svuzp2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.uzp2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svuzp2_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + return SVE_ACLE_FUNC(svuzp2,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-bfloat.c deleted file mode 100644 index 31d5e34e3cd84..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 
-target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svzip1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svzip1_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svzip1_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - return SVE_ACLE_FUNC(svzip1,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64-bfloat.c deleted file mode 100644 index f7ea7e177d69a..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64-bfloat.c +++ /dev/null @@ -1,30 +0,0 @@ -// NOTE: Assertions have been 
autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svzip1_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svzip1_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svzip1_bf16(svbfloat16_t op1, svbfloat16_t op2) { - return SVE_ACLE_FUNC(svzip1q, _bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64.c index b5aef2270c3cc..2dd0bb6b0f1a5 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1-fp64.c @@ -168,3 +168,17 @@ svfloat32_t test_svzip1_f32(svfloat32_t op1, svfloat32_t op2) { svfloat64_t test_svzip1_f64(svfloat64_t op1, svfloat64_t op2) { return SVE_ACLE_FUNC(svzip1q, _f64, , )(op1, op2); } + +// CHECK-LABEL: @test_svzip1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip1_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svzip1_bf16(svbfloat16_t op1, svbfloat16_t op2) { + return SVE_ACLE_FUNC(svzip1q, _bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1.c index 95a0f499248f1..5984b4be3a93e 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1.c +++ 
b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip1.c @@ -246,3 +246,18 @@ svbool_t test_svzip1_b64(svbool_t op1, svbool_t op2) MODE_ATTR { return svzip1_b64(op1, op2); } + +// CHECK-LABEL: @test_svzip1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip1_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip1.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svzip1_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + return SVE_ACLE_FUNC(svzip1,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-bfloat.c deleted file mode 100644 index d750fae041840..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-bfloat.c +++ /dev/null @@ -1,39 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -#include - -#if defined __ARM_FEATURE_SME -#define MODE_ATTR __arm_streaming -#else -#define MODE_ATTR -#endif - - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svzip2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svzip2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svzip2_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR -{ - return SVE_ACLE_FUNC(svzip2,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64-bfloat.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64-bfloat.c deleted file mode 100644 index b3ffe987830ed..0000000000000 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64-bfloat.c +++ /dev/null @@ -1,30 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 
-fclang-abi-compat=latest -target-feature +f64mm -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -target-feature +f64mm -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svzip2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svzip2_bf16u14__SVBfloat16_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svzip2_bf16(svbfloat16_t op1, svbfloat16_t op2) { - return SVE_ACLE_FUNC(svzip2q, _bf16, , )(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64.c index a890d8c77e9e6..4161e3f7ca958 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2-fp64.c @@ -168,3 +168,17 @@ svfloat32_t 
test_svzip2_f32(svfloat32_t op1, svfloat32_t op2) { svfloat64_t test_svzip2_f64(svfloat64_t op1, svfloat64_t op2) { return SVE_ACLE_FUNC(svzip2q, _f64, , )(op1, op2); } + +// CHECK-LABEL: @test_svzip2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2q.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svzip2_bf16(svbfloat16_t op1, svbfloat16_t op2) { + return SVE_ACLE_FUNC(svzip2q, _bf16, , )(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2.c index 5b19cfd673f3d..b86871bc7a8e0 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_zip2.c @@ -246,3 +246,18 @@ svbool_t test_svzip2_b64(svbool_t op1, svbool_t op2) MODE_ATTR { return svzip2_b64(op1, op2); } + +// CHECK-LABEL: @test_svzip2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svzip2_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.zip2.nxv8bf16( [[OP1:%.*]], [[OP2:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svzip2_bf16(svbfloat16_t op1, svbfloat16_t op2) MODE_ATTR +{ + return SVE_ACLE_FUNC(svzip2,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c index 82e318a7460c2..22a9c6fbf7b03 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c +++ 
b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_luti.c @@ -1,14 +1,14 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -Wall -o /dev/null %s +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +lut -O1 -Werror -Wall -o 
/dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_revd.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_revd.c index 2143f27f95e45..a454511576241 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_revd.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_revd.c @@ -1,17 +1,17 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +bf16 -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +bf16 -target-feature +sve -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: -target-feature +sve -target-feature +sme -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -o /dev/null %s #include #if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE) diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2-bfloat.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2-bfloat.c deleted file mode 100644 index 96af8c0bfa97d..0000000000000 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2-bfloat.c +++ /dev/null @@ -1,29 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtbl2_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl2.nxv8bf16( [[DATA_COERCE0:%.*]], [[DATA_COERCE1:%.*]], [[INDICES:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svtbl2_bf1614svbfloat16x2_tu12__SVUint16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl2.nxv8bf16( [[DATA_COERCE0:%.*]], [[DATA_COERCE1:%.*]], [[INDICES:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtbl2_bf16(svbfloat16x2_t data, svuint16_t indices) { - return SVE_ACLE_FUNC(svtbl2, _bf16, , )(data, indices); -} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2.c index 999a87fcfbbc8..a1ae50203b853 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbl2.c @@ -179,3 +179,17 @@ svfloat64_t test_svtbl2_f64(svfloat64x2_t data, svuint64_t indices) { return SVE_ACLE_FUNC(svtbl2,_f64,,)(data, indices); } + +// CHECK-LABEL: @test_svtbl2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl2.nxv8bf16( [[DATA_COERCE0:%.*]], [[DATA_COERCE1:%.*]], [[INDICES:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z16test_svtbl2_bf1614svbfloat16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbl2.nxv8bf16( [[DATA_COERCE0:%.*]], [[DATA_COERCE1:%.*]], [[INDICES:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtbl2_bf16(svbfloat16x2_t data, svuint16_t indices) { + return SVE_ACLE_FUNC(svtbl2, _bf16, , )(data, indices); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx-bfloat.c 
b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx-bfloat.c deleted file mode 100644 index c39d644e1b84f..0000000000000 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx-bfloat.c +++ /dev/null @@ -1,29 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svtbx_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbx.nxv8bf16( [[FALLBACK:%.*]], [[DATA:%.*]], [[INDICES:%.*]]) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z15test_svtbx_bf16u14__SVBfloat16_tS_u12__SVUint16_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbx.nxv8bf16( [[FALLBACK:%.*]], [[DATA:%.*]], [[INDICES:%.*]]) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbfloat16_t test_svtbx_bf16(svbfloat16_t fallback, svbfloat16_t data, svuint16_t indices) { - return SVE_ACLE_FUNC(svtbx, _bf16, , )(fallback, data, indices); -} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx.c index d1e5893e52d3e..3cef7d66b6a2e 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_tbx.c @@ -179,3 +179,17 @@ svfloat64_t test_svtbx_f64(svfloat64_t fallback, svfloat64_t data, svuint64_t in { return SVE_ACLE_FUNC(svtbx,_f64,,)(fallback, data, indices); } + +// CHECK-LABEL: @test_svtbx_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbx.nxv8bf16( [[FALLBACK:%.*]], [[DATA:%.*]], [[INDICES:%.*]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z15test_svtbx_bf16u14__SVBfloat16_tS_u12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.tbx.nxv8bf16( [[FALLBACK:%.*]], [[DATA:%.*]], [[INDICES:%.*]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbfloat16_t test_svtbx_bf16(svbfloat16_t fallback, svbfloat16_t data, svuint16_t indices) { + return SVE_ACLE_FUNC(svtbx, _bf16, , )(fallback, data, indices); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw-bfloat.c 
b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw-bfloat.c deleted file mode 100644 index 95b0f53abdce0..0000000000000 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw-bfloat.c +++ /dev/null @@ -1,33 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svwhilerw_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilerw.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z19test_svwhilerw_bf16PKu6__bf16S0_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilerw.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbool_t test_svwhilerw_bf16(const bfloat16_t *op1, const bfloat16_t *op2) -{ - return SVE_ACLE_FUNC(svwhilerw,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw.c index 13f1984db94cc..59462657693bf 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilerw.c @@ -197,3 +197,20 @@ svbool_t test_svwhilerw_f64(const float64_t *op1, const float64_t *op2) { return SVE_ACLE_FUNC(svwhilerw,_f64,,)(op1, op2); } + +// CHECK-LABEL: @test_svwhilerw_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilerw.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svwhilerw_bf16PKu6__bf16S0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilerw.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: 
ret [[TMP1]] +// +svbool_t test_svwhilerw_bf16(const bfloat16_t *op1, const bfloat16_t *op2) +{ + return SVE_ACLE_FUNC(svwhilerw,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr-bfloat.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr-bfloat.c deleted file mode 100644 index 647f2aef98d81..0000000000000 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr-bfloat.c +++ /dev/null @@ -1,33 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +bf16 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svwhilewr_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z19test_svwhilewr_bf16PKu6__bf16S0_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svbool_t test_svwhilewr_bf16(const bfloat16_t *op1, const bfloat16_t *op2) -{ - return SVE_ACLE_FUNC(svwhilewr,_bf16,,)(op1, op2); -} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr.c index fddede6a4dc09..469e299f6dd47 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_whilewr.c @@ -197,3 +197,20 @@ svbool_t test_svwhilewr_f64(const float64_t *op1, const float64_t *op2) { return SVE_ACLE_FUNC(svwhilewr,_f64,,)(op1, op2); } + +// CHECK-LABEL: @test_svwhilewr_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z19test_svwhilewr_bf16PKu6__bf16S0_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.whilewr.h.nxv8i1.p0(ptr [[OP1:%.*]], ptr [[OP2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( [[TMP0]]) +// CPP-CHECK-NEXT: 
ret [[TMP1]] +// +svbool_t test_svwhilewr_bf16(const bfloat16_t *op1, const bfloat16_t *op2) +{ + return SVE_ACLE_FUNC(svwhilewr,_bf16,,)(op1, op2); +} diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfadd.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfadd.c index 0f3b92f81cdee..9d45b829c39bd 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfadd.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfadd.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null 
%s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmax.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmax.c index cc3207aba01e4..a00ca93f9a6af 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmax.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmax.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S 
-p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c index 7983943af3d89..0ad3b448ee2cf 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmaxnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmin.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmin.c index 97159f119375a..fcd71acb38327 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmin.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmin.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfminnm.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfminnm.c index 4cadbdc2b4d88..e261eb4b8113c 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfminnm.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfminnm.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmla.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmla.c index 720853fccb650..fe6f19e9370a0 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmla.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmla.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s 
| opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmls.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmls.c index 98c4dc95fecdf..4c1f8d2676625 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmls.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmls.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmlsl.c index 
ad1128fd2b896..83afd04880bbc 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmlsl.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmlsl.c @@ -2,15 +2,15 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: 
%clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmul.c 
b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmul.c index e9443e35cee57..2939b684be744 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmul.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfmul.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 
-target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfsub.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfsub.c index a58f0046e61f3..a27c316b53bd1 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfsub.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_bfsub.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o 
/dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sve-b16b16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined __ARM_FEATURE_SME diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_cntp.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_cntp.c index 1de597fef1f45..ae450d7bbaf1d 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_cntp.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_cntp.c @@ -3,10 +3,10 @@ // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone 
-Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create2_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create2_bool.c index f416fe2b1b6ef..1574e4c12c753 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create2_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create2_bool.c @@ -5,18 +5,18 @@ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve 
-target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create4_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create4_bool.c index 0026124deaae8..a145c2df97660 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create4_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_create4_bool.c @@ -5,18 +5,18 @@ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 
-target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dot.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dot.c index 8543385dd263d..704a245ed1460 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dot.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dot.c @@ -2,14 +2,14 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature 
+bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // 
RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #if defined(__ARM_FEATURE_SME) && defined(__ARM_FEATURE_SVE) diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c index c9ddb324d843b..f172d71b4f42a 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_dupq.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature 
+bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c index 38769aeee8b2b..3e71ba068b8b9 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: 
%clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_fclamp.c 
b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_fclamp.c index 8fbcc43678c7d..ef2bfd02b077d 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_fclamp.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_fclamp.c @@ -16,7 +16,7 @@ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sve \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sve \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get2_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get2_bool.c index cb30296cb618a..844e8c5791a0c 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get2_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get2_bool.c @@ -5,18 +5,18 @@ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S 
-passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: 
aarch64-registered-target #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get4_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get4_bool.c index 611fc061f810e..5553c8669a015 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get4_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_get4_bool.c @@ -5,18 +5,18 @@ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve 
-target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c index ee5c2c592c61d..af39be3c8c06e 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -1,13 +1,13 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 
-target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wno-unknown-attributes -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c index 692af131e69de..02c7586f15122 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ldnt1.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 
-target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_load_struct.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_load_struct.c index 5119bc085bc78..3e39fac2ce3a1 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_load_struct.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_load_struct.c @@ -1,13 +1,13 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature 
+sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_loads.c index 0a87469887df9..b3cf65a576fab 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_loads.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_loads.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 
-target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone 
-Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pext.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pext.c index deb126236ad57..15c01b9db8afa 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pext.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pext.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature 
+sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pfalse.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pfalse.c index 5c83789a19505..fdf9c498341fa 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pfalse.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_pfalse.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: 
%clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel.c index d2fff9f3002b7..8b7761e981c96 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel.c @@ -9,8 +9,8 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +bf16 -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: -target-feature +sme -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel_svcount.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel_svcount.c index 618836b044c77..589c540dcafb6 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel_svcount.c +++ 
b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_psel_svcount.c @@ -11,8 +11,8 @@ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ -// RUN: -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ptrue.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ptrue.c index aeaf4d7ae0e4c..73025d5df5c69 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ptrue.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_ptrue.c @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - %s | FileCheck %s -// RUN: 
%clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qcvtn.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qcvtn.c index f80f7455c539a..091a17ec1bc76 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qcvtn.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qcvtn.c @@ -2,17 +2,17 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple 
aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qrshr.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qrshr.c index baa4acf6ec6c9..b3a33190fc4fa 100644 --- 
a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qrshr.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_qrshr.c @@ -1,13 +1,13 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sme2 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o 
/dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_sclamp.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_sclamp.c index 07b77dbb1378c..51e522d6f2625 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_sclamp.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_sclamp.c @@ -10,13 +10,13 @@ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sve \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple 
aarch64-none-linux-gnu -target-feature +sme \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set2_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set2_bool.c index 08b9094a0082d..3d8490188b130 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set2_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set2_bool.c @@ -5,18 +5,18 @@ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: 
%clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set4_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set4_bool.c index 9b20d23d0fe84..8bc8db0717b57 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set4_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_set4_bool.c @@ -5,18 +5,18 @@ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 
-triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ // RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 
-triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c index e71e68114a5af..092f31ba8491a 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_st1.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git 
a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c index 1544260377a20..99dff2c0a5ab2 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_stnt1.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store.c index c747fc025c74c..058cc3afd4560 100644 --- 
a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 
-DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store_struct.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store_struct.c index 863189c5051eb..60c5701dec6f0 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store_struct.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_store_struct.c @@ -1,13 +1,13 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2p1 
-target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c index 5e8007f952538..280768db52b71 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tblq.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c index ee44ff57cee03..8ac4ebf6a6057 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_tbxq.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature 
+sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uclamp.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uclamp.c index 4c4ffeef38c18..4a23e9d3708dc 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uclamp.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uclamp.c @@ -10,13 +10,13 @@ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sve \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu 
-target-feature +bf16 -target-feature +sme \ +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_undef_bool.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_undef_bool.c index c0e429307ba32..2c988382e2bef 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_undef_bool.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_undef_bool.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 
-target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O2 -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O2 -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c index c059ee00fb8eb..979105d5be91e 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq1.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme 
-target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 
-target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c index c7d2a0967d809..cfe295ea64d7f 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_uzpq2.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme 
-target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_pn.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_pn.c index d02b8069d2a29..38a95a2ca7835 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_pn.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_pn.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o 
/dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_x2.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_x2.c index bd485f15e490e..99bfdd1de5790 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_x2.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_while_x2.c @@ -1,13 +1,13 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall 
-emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -S -disable-O0-optnone -Werror -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c index 1d6f90fb9e514..1fc35a25c6e06 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq1.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme 
-target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 
-target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c index b900507d43259..2efac16a43add 100644 --- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c +++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_zipq2.c @@ -1,20 +1,20 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme 
-target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -target-feature +sme2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu 
-target-feature +sve -target-feature +sve2 -target-feature +sve2p1 -target-feature +bf16\ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p1 \ // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git a/clang/test/CodeGen/PowerPC/check-zero-vector.c b/clang/test/CodeGen/PowerPC/check-zero-vector.c new file mode 100644 index 0000000000000..cb6c826641366 --- /dev/null +++ b/clang/test/CodeGen/PowerPC/check-zero-vector.c @@ -0,0 +1,143 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC_64 +// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC_64LE +// RUN: %clang_cc1 -triple powerpc-ibm-aix -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC_32 + +// POWERPC_64-LABEL: define signext i32 @test_Greater_than( +// POWERPC_64-SAME: ptr noundef [[COLAUTHS:%.*]]) #[[ATTR0:[0-9]+]] { +// POWERPC_64-NEXT: [[ENTRY:.*:]] +// POWERPC_64-NEXT: [[COLAUTHS_ADDR:%.*]] = alloca ptr, align 8 +// POWERPC_64-NEXT: [[RESULT:%.*]] = alloca i16, align 2 +// POWERPC_64-NEXT: [[I:%.*]] = alloca i32, align 4 +// POWERPC_64-NEXT: store ptr [[COLAUTHS]], ptr [[COLAUTHS_ADDR]], align 8 +// POWERPC_64-NEXT: store i16 0, ptr [[RESULT]], align 2 +// POWERPC_64-NEXT: store i32 0, ptr [[I]], align 4 +// POWERPC_64-NEXT: br label %[[FOR_COND:.*]] +// POWERPC_64: [[FOR_COND]]: +// POWERPC_64-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// POWERPC_64-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// POWERPC_64: [[FOR_BODY]]: +// POWERPC_64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[COLAUTHS_ADDR]], align 8 +// POWERPC_64-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_64-NEXT: [[IDXPROM:%.*]] = sext i32 
[[TMP2]] to i64 +// POWERPC_64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 [[IDXPROM]] +// POWERPC_64-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +// POWERPC_64-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 +// POWERPC_64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[CONV]], 0 +// POWERPC_64-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// POWERPC_64: [[IF_THEN]]: +// POWERPC_64-NEXT: [[TMP4:%.*]] = load i16, ptr [[RESULT]], align 2 +// POWERPC_64-NEXT: [[INC:%.*]] = add i16 [[TMP4]], 1 +// POWERPC_64-NEXT: store i16 [[INC]], ptr [[RESULT]], align 2 +// POWERPC_64-NEXT: br label %[[IF_END]] +// POWERPC_64: [[IF_END]]: +// POWERPC_64-NEXT: br label %[[FOR_INC:.*]] +// POWERPC_64: [[FOR_INC]]: +// POWERPC_64-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_64-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP5]], 1 +// POWERPC_64-NEXT: store i32 [[INC3]], ptr [[I]], align 4 +// POWERPC_64-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// POWERPC_64: [[FOR_END]]: +// POWERPC_64-NEXT: [[TMP6:%.*]] = load i16, ptr [[RESULT]], align 2 +// POWERPC_64-NEXT: [[CONV4:%.*]] = zext i16 [[TMP6]] to i32 +// POWERPC_64-NEXT: ret i32 [[CONV4]] +// +// POWERPC_64LE-LABEL: define dso_local signext i32 @test_Greater_than( +// POWERPC_64LE-SAME: ptr noundef [[COLAUTHS:%.*]]) #[[ATTR0:[0-9]+]] { +// POWERPC_64LE-NEXT: [[ENTRY:.*:]] +// POWERPC_64LE-NEXT: [[COLAUTHS_ADDR:%.*]] = alloca ptr, align 8 +// POWERPC_64LE-NEXT: [[RESULT:%.*]] = alloca i16, align 2 +// POWERPC_64LE-NEXT: [[I:%.*]] = alloca i32, align 4 +// POWERPC_64LE-NEXT: store ptr [[COLAUTHS]], ptr [[COLAUTHS_ADDR]], align 8 +// POWERPC_64LE-NEXT: store i16 0, ptr [[RESULT]], align 2 +// POWERPC_64LE-NEXT: store i32 0, ptr [[I]], align 4 +// POWERPC_64LE-NEXT: br label %[[FOR_COND:.*]] +// POWERPC_64LE: [[FOR_COND]]: +// POWERPC_64LE-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_64LE-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// 
POWERPC_64LE-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// POWERPC_64LE: [[FOR_BODY]]: +// POWERPC_64LE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[COLAUTHS_ADDR]], align 8 +// POWERPC_64LE-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_64LE-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP2]] to i64 +// POWERPC_64LE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 [[IDXPROM]] +// POWERPC_64LE-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +// POWERPC_64LE-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 +// POWERPC_64LE-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[CONV]], 0 +// POWERPC_64LE-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// POWERPC_64LE: [[IF_THEN]]: +// POWERPC_64LE-NEXT: [[TMP4:%.*]] = load i16, ptr [[RESULT]], align 2 +// POWERPC_64LE-NEXT: [[INC:%.*]] = add i16 [[TMP4]], 1 +// POWERPC_64LE-NEXT: store i16 [[INC]], ptr [[RESULT]], align 2 +// POWERPC_64LE-NEXT: br label %[[IF_END]] +// POWERPC_64LE: [[IF_END]]: +// POWERPC_64LE-NEXT: br label %[[FOR_INC:.*]] +// POWERPC_64LE: [[FOR_INC]]: +// POWERPC_64LE-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_64LE-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP5]], 1 +// POWERPC_64LE-NEXT: store i32 [[INC3]], ptr [[I]], align 4 +// POWERPC_64LE-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// POWERPC_64LE: [[FOR_END]]: +// POWERPC_64LE-NEXT: [[TMP6:%.*]] = load i16, ptr [[RESULT]], align 2 +// POWERPC_64LE-NEXT: [[CONV4:%.*]] = zext i16 [[TMP6]] to i32 +// POWERPC_64LE-NEXT: ret i32 [[CONV4]] +// +// POWERPC_32-LABEL: define i32 @test_Greater_than( +// POWERPC_32-SAME: ptr noundef [[COLAUTHS:%.*]]) #[[ATTR0:[0-9]+]] { +// POWERPC_32-NEXT: [[ENTRY:.*:]] +// POWERPC_32-NEXT: [[COLAUTHS_ADDR:%.*]] = alloca ptr, align 4 +// POWERPC_32-NEXT: [[RESULT:%.*]] = alloca i16, align 2 +// POWERPC_32-NEXT: [[I:%.*]] = alloca i32, align 4 +// POWERPC_32-NEXT: store ptr [[COLAUTHS]], ptr [[COLAUTHS_ADDR]], align 4 +// POWERPC_32-NEXT: 
store i16 0, ptr [[RESULT]], align 2 +// POWERPC_32-NEXT: store i32 0, ptr [[I]], align 4 +// POWERPC_32-NEXT: br label %[[FOR_COND:.*]] +// POWERPC_32: [[FOR_COND]]: +// POWERPC_32-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_32-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4 +// POWERPC_32-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// POWERPC_32: [[FOR_BODY]]: +// POWERPC_32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[COLAUTHS_ADDR]], align 4 +// POWERPC_32-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 [[TMP2]] +// POWERPC_32-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +// POWERPC_32-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 +// POWERPC_32-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[CONV]], 0 +// POWERPC_32-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// POWERPC_32: [[IF_THEN]]: +// POWERPC_32-NEXT: [[TMP4:%.*]] = load i16, ptr [[RESULT]], align 2 +// POWERPC_32-NEXT: [[INC:%.*]] = add i16 [[TMP4]], 1 +// POWERPC_32-NEXT: store i16 [[INC]], ptr [[RESULT]], align 2 +// POWERPC_32-NEXT: br label %[[IF_END]] +// POWERPC_32: [[IF_END]]: +// POWERPC_32-NEXT: br label %[[FOR_INC:.*]] +// POWERPC_32: [[FOR_INC]]: +// POWERPC_32-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 +// POWERPC_32-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP5]], 1 +// POWERPC_32-NEXT: store i32 [[INC3]], ptr [[I]], align 4 +// POWERPC_32-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// POWERPC_32: [[FOR_END]]: +// POWERPC_32-NEXT: [[TMP6:%.*]] = load i16, ptr [[RESULT]], align 2 +// POWERPC_32-NEXT: [[CONV4:%.*]] = zext i16 [[TMP6]] to i32 +// POWERPC_32-NEXT: ret i32 [[CONV4]] +// +int test_Greater_than(unsigned short *colauths) { + unsigned short result = 0; + for (int i = 0; i < 4; i++) { + if (colauths[i] > 0) { + result++; + } + } + return result; +} +//. 
+// POWERPC_64: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]} +// POWERPC_64: [[META3]] = !{!"llvm.loop.mustprogress"} +//. +// POWERPC_64LE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]} +// POWERPC_64LE: [[META3]] = !{!"llvm.loop.mustprogress"} +//. +// POWERPC_32: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]} +// POWERPC_32: [[META3]] = !{!"llvm.loop.mustprogress"} +//. diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvt.c index f4c2557855caf..f613ca1a16fc8 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvt.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvt.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4(vint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4(vint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2(vint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2(vint8mf4_t src, size_t 
vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1(vint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1(vint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2(vint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2(vint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4(vint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4(vint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], 
i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8(vint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8(vint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2(vint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2(vint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1(vint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1(vint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2(vint16m1_t src, size_t vl) { @@ -99,7 
+99,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2(vint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4(vint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4(vint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8(vint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8(vint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1(vint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1(vint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2(vint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2(vint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4(vint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4(vint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8(vint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8(vint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( poison, 
[[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_m(vbool64_t mask, vint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_m(vbool64_t mask, vint8mf8_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_m(vbool32_t mask, vint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_m(vbool32_t mask, vint8mf4_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_m(vbool16_t mask, vint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_m(vbool16_t mask, vint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_m(vbool8_t mask, vint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_m(vbool8_t mask, vint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_m(vbool4_t mask, vint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_m(vbool4_t mask, vint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_m(vbool2_t mask, vint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_m(vbool2_t mask, vint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_m(vbool64_t mask, vint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_m(vbool64_t mask, vint16mf4_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_m(vbool32_t mask, vint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_m(vbool32_t mask, vint16mf2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_m(vbool16_t mask, vint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_m(vbool16_t mask, vint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], 
i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_m(vbool8_t mask, vint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_m(vbool8_t mask, vint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_m(vbool4_t mask, vint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_m(vbool4_t mask, vint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_m(vbool64_t mask, vint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_m(vbool64_t mask, vint32mf2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_m(vbool32_t mask, vint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_m(vbool32_t mask, vint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_m(vbool16_t mask, vint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_m(vbool16_t mask, vint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_m(vbool8_t mask, vint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvtu.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvtu.c index 6026e80b5f600..ac1a7dd8b8ec0 100644 --- 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvtu.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vwcvtu.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4(vuint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4(vuint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2(vuint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2(vuint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1(vuint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1(vuint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: 
define dso_local @test_vwcvtu_x_x_v_u16m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2(vuint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2(vuint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4(vuint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4(vuint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8(vuint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8(vuint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i32.i64( poison, 
[[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2(vuint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2(vuint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1(vuint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1(vuint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2(vuint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2(vuint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t 
test_vwcvtu_x_x_v_u32m4(vuint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4(vuint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8(vuint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8(vuint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1(vuint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1(vuint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2(vuint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2(vuint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4 // 
CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4(vuint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4(vuint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8(vuint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8(vuint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_m(vbool64_t mask, vuint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_m(vbool64_t mask, vuint8mf8_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_m(vbool32_t mask, vuint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_m(vbool32_t mask, vuint8mf4_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_m(vbool16_t mask, vuint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_m(vbool16_t mask, vuint8mf2_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_m(vbool8_t mask, vuint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_m(vbool8_t mask, vuint8m1_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_m(vbool4_t mask, vuint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_m(vbool4_t mask, vuint8m2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_m(vbool2_t mask, vuint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_m(vbool2_t mask, vuint8m4_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_m(vbool64_t mask, vuint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_m(vbool64_t mask, vuint16mf4_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_m // CHECK-RV64-SAME: 
( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_m(vbool32_t mask, vuint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_m(vbool32_t mask, vuint16mf2_t src, size_t v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_m(vbool16_t mask, vuint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_m(vbool16_t mask, vuint16m1_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_m(vbool8_t mask, vuint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_m(vbool8_t mask, vuint16m2_t src, size_t vl) // CHECK-RV64-LABEL: define 
dso_local @test_vwcvtu_x_x_v_u32m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_m(vbool4_t mask, vuint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_m(vbool4_t mask, vuint16m4_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_m(vbool64_t mask, vuint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_m(vbool64_t mask, vuint32mf2_t src, size_t v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_m(vbool32_t mask, vuint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_m(vbool32_t 
mask, vuint32m1_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_m(vbool16_t mask, vuint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_m(vbool16_t mask, vuint32m2_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_m(vbool8_t mask, vuint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvt.c index b60cf83a4a780..63d8ccfa7d966 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvt.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvt.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, 
i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4(vint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4(vint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2(vint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2(vint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1(vint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1(vint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2(vint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vint16m2_t 
test_vwcvt_x_x_v_i16m2(vint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4(vint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4(vint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8(vint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8(vint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2(vint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2(vint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1(vint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1(vint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2(vint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2(vint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4(vint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4(vint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t 
test_vwcvt_x_x_v_i32m8(vint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8(vint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1(vint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1(vint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2(vint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2(vint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4(vint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4(vint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8(vint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8(vint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_m(vbool64_t mask, vint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_m(vbool64_t mask, vint8mf8_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_m(vbool32_t mask, vint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_m(vbool32_t mask, vint8mf4_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_m(vbool16_t mask, vint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_m(vbool16_t mask, vint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_m(vbool8_t mask, vint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_m(vbool8_t mask, vint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_m(vbool4_t mask, vint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_m(vbool4_t mask, vint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_m(vbool2_t mask, vint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_m(vbool2_t mask, vint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_m(vbool64_t mask, vint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_m(vbool64_t mask, vint16mf4_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_m(vbool32_t mask, vint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_m(vbool32_t mask, vint16mf2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], 
[[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_m(vbool16_t mask, vint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_m(vbool16_t mask, vint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_m(vbool8_t mask, vint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_m(vbool8_t mask, vint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_m(vbool4_t mask, vint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_m(vbool4_t mask, vint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local 
@test_vwcvt_x_x_v_i64m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_m(vbool64_t mask, vint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_m(vbool64_t mask, vint32mf2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_m(vbool32_t mask, vint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_m(vbool32_t mask, vint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_m(vbool16_t mask, vint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_m(vbool16_t mask, vint32m2_t src, size_t vl) { 
// CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_m(vbool8_t mask, vint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvtu.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvtu.c index 555888d94980c..96e47f105be87 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvtu.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vwcvtu.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4(vuint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4(vuint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, i64 
[[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2(vuint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2(vuint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1(vuint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1(vuint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2(vuint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2(vuint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4(vuint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4(vuint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local 
@test_vwcvtu_x_x_v_u16m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8(vuint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8(vuint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2(vuint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2(vuint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1(vuint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1(vuint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i32.i64( poison, 
[[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2(vuint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2(vuint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4(vuint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4(vuint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8(vuint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8(vuint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t 
test_vwcvtu_x_x_v_u64m1(vuint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1(vuint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2(vuint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2(vuint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4(vuint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4(vuint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8 // CHECK-RV64-SAME: ( [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8(vuint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8(vuint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_m // 
CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_m(vbool64_t mask, vuint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_m(vbool64_t mask, vuint8mf8_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_m(vbool32_t mask, vuint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_m(vbool32_t mask, vuint8mf4_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_m(vbool16_t mask, vuint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_m(vbool16_t mask, vuint8mf2_t src, size_t vl // CHECK-RV64-LABEL: 
define dso_local @test_vwcvtu_x_x_v_u16m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_m(vbool8_t mask, vuint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_m(vbool8_t mask, vuint8m1_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_m(vbool4_t mask, vuint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_m(vbool4_t mask, vuint8m2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( poison, [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_m(vbool2_t mask, vuint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_m(vbool2_t mask, 
vuint8m4_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_m(vbool64_t mask, vuint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_m(vbool64_t mask, vuint16mf4_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_m(vbool32_t mask, vuint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_m(vbool32_t mask, vuint16mf2_t src, size_t v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_m(vbool16_t mask, vuint16m1_t src, size_t vl) { @@ -249,7 
+249,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_m(vbool16_t mask, vuint16m1_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_m(vbool8_t mask, vuint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_m(vbool8_t mask, vuint16m2_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( poison, [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_m(vbool4_t mask, vuint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_m(vbool4_t mask, vuint16m4_t src, size_t vl) // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t 
test_vwcvtu_x_x_v_u64m1_m(vbool64_t mask, vuint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_m(vbool64_t mask, vuint32mf2_t src, size_t v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_m(vbool32_t mask, vuint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_m(vbool32_t mask, vuint32m1_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_m(vbool16_t mask, vuint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_m(vbool16_t mask, vuint32m2_t src, size_t vl // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_m // CHECK-RV64-SAME: ( [[MASK:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( poison, [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( poison, [[SRC]], i32 0, [[MASK]], i64 
[[VL]], i64 3) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_m(vbool8_t mask, vuint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvt.c index 783c449e0289d..50066814e8606 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvt.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvt.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_tu(vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_tu(vint16mf4_t maskedoff, vint8mf8_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_tu(vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_tu(vint16mf2_t maskedoff, vint8mf4_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], 
[[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_tu(vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_tu(vint16m1_t maskedoff, vint8mf2_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_tu(vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_tu(vint16m2_t maskedoff, vint8m1_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_tu(vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_tu(vint16m4_t maskedoff, vint8m2_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_tu(vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_tu(vint16m8_t maskedoff, vint8m4_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_tu(vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_tu(vint32mf2_t maskedoff, vint16mf4_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_tu(vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_tu(vint32m1_t maskedoff, vint16mf2_t src, size // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_tu(vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_tu(vint32m2_t maskedoff, vint16m1_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_tu(vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_tu(vint32m4_t maskedoff, vint16m2_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_tu(vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_tu(vint32m8_t maskedoff, vint16m4_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_tu(vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_tu(vint64m1_t maskedoff, vint32mf2_t src, size // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_tu(vint64m2_t maskedoff, vint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_tu(vint64m2_t maskedoff, vint32m1_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_tu(vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_tu(vint64m4_t maskedoff, vint32m2_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_tu(vint64m8_t maskedoff, vint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8_tu(vint64m8_t maskedoff, vint32m4_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_tum(vbool64_t mask, vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_tum(vbool64_t mask, vint16mf4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_tum(vbool32_t mask, vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_tum(vbool32_t mask, vint16mf2_t maskedoff, v // CHECK-RV64-LABEL: 
define dso_local @test_vwcvt_x_x_v_i16m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_tum(vbool16_t mask, vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_tum(vbool16_t mask, vint16m1_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_tum(vbool8_t mask, vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_tum(vbool8_t mask, vint16m2_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t 
test_vwcvt_x_x_v_i16m4_tum(vbool4_t mask, vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_tum(vbool4_t mask, vint16m4_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_tum(vbool2_t mask, vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_tum(vbool2_t mask, vint16m8_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_tum(vbool64_t mask, vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_tum(vbool64_t mask, vint32mf2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], 
i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_tum(vbool32_t mask, vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_tum(vbool32_t mask, vint32m1_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_tum(vbool16_t mask, vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_tum(vbool16_t mask, vint32m2_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_tum(vbool8_t mask, vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_tum(vbool8_t mask, vint32m4_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_tum(vbool4_t mask, vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_tum(vbool4_t mask, vint32m8_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_tum(vbool64_t mask, vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_tum(vbool64_t mask, vint64m1_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_tum(vbool32_t mask, vint64m2_t maskedoff, vint32m1_t 
src, size_t vl) { @@ -289,7 +289,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_tum(vbool32_t mask, vint64m2_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_tum(vbool16_t mask, vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_tum(vbool16_t mask, vint64m4_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_tum(vbool8_t mask, vint64m8_t maskedoff, vint32m4_t src, size_t vl) { @@ -309,7 +309,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8_tum(vbool8_t mask, vint64m8_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_tumu(vbool64_t mask, vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -319,7 +319,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_tumu(vbool64_t mask, vint16mf4_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_tumu(vbool32_t mask, vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -329,7 +329,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_tumu(vbool32_t mask, vint16mf2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_tumu(vbool16_t mask, vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -339,7 +339,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_tumu(vbool16_t mask, vint16m1_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_tumu(vbool8_t mask, vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -349,7 +349,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_tumu(vbool8_t mask, vint16m2_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_tumu(vbool4_t mask, vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -359,7 +359,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_tumu(vbool4_t mask, vint16m4_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_tumu(vbool2_t mask, vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -369,7 +369,7 @@ vint16m8_t 
test_vwcvt_x_x_v_i16m8_tumu(vbool2_t mask, vint16m8_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_tumu(vbool64_t mask, vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -379,7 +379,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_tumu(vbool64_t mask, vint32mf2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_tumu(vbool32_t mask, vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -389,7 +389,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_tumu(vbool32_t mask, vint32m1_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( 
[[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_tumu(vbool16_t mask, vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -399,7 +399,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_tumu(vbool16_t mask, vint32m2_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_tumu(vbool8_t mask, vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -409,7 +409,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_tumu(vbool8_t mask, vint32m4_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_tumu(vbool4_t mask, vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -419,7 +419,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_tumu(vbool4_t mask, vint32m8_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_tumu(vbool64_t mask, vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -429,7 +429,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_tumu(vbool64_t mask, vint64m1_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_tumu(vbool32_t mask, vint64m2_t maskedoff, vint32m1_t src, size_t vl) { @@ -439,7 +439,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_tumu(vbool32_t mask, vint64m2_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_tumu(vbool16_t mask, vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -449,7 +449,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_tumu(vbool16_t mask, 
vint64m4_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_tumu(vbool8_t mask, vint64m8_t maskedoff, vint32m4_t src, size_t vl) { @@ -459,7 +459,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8_tumu(vbool8_t mask, vint64m8_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_mu(vbool64_t mask, vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -469,7 +469,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_mu(vbool64_t mask, vint16mf4_t maskedoff, vi // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_mu(vbool32_t mask, vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -479,7 +479,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_mu(vbool32_t mask, vint16mf2_t maskedoff, vi // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_mu(vbool16_t mask, vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -489,7 +489,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_mu(vbool16_t mask, vint16m1_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_mu(vbool8_t mask, vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -499,7 +499,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_mu(vbool8_t mask, vint16m2_t maskedoff, vint8m // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( 
[[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_mu(vbool4_t mask, vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -509,7 +509,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_mu(vbool4_t mask, vint16m4_t maskedoff, vint8m // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_mu(vbool2_t mask, vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -519,7 +519,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_mu(vbool2_t mask, vint16m8_t maskedoff, vint8m // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_mu(vbool64_t mask, vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -529,7 +529,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_mu(vbool64_t mask, vint32mf2_t maskedoff, vi // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_mu // 
CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_mu(vbool32_t mask, vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -539,7 +539,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_mu(vbool32_t mask, vint32m1_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_mu(vbool16_t mask, vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -549,7 +549,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_mu(vbool16_t mask, vint32m2_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_mu(vbool8_t mask, vint32m4_t 
maskedoff, vint16m2_t src, size_t vl) { @@ -559,7 +559,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_mu(vbool8_t mask, vint32m4_t maskedoff, vint16 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_mu(vbool4_t mask, vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -569,7 +569,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_mu(vbool4_t mask, vint32m8_t maskedoff, vint16 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_mu(vbool64_t mask, vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -579,7 +579,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_mu(vbool64_t mask, vint64m1_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_mu(vbool32_t mask, vint64m2_t maskedoff, vint32m1_t src, size_t vl) { @@ -589,7 +589,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_mu(vbool32_t mask, vint64m2_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_mu(vbool16_t mask, vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -599,7 +599,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_mu(vbool16_t mask, vint64m4_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_mu(vbool8_t mask, vint64m8_t maskedoff, vint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvtu.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvtu.c index 3858d5b3e2287..45dfce31bb90a 100644 --- 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvtu.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vwcvtu.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint8mf8_t src // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint8mf4_t src // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_tu(vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_tu(vuint16m1_t maskedoff, vuint8mf2_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_tu(vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_tu(vuint16m2_t maskedoff, vuint8m1_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_tu(vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_tu(vuint16m4_t maskedoff, vuint8m2_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_tu(vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_tu(vuint16m8_t maskedoff, vuint8m4_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint16mf4_t sr // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_tu(vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_tu(vuint32m1_t maskedoff, vuint16mf2_t src, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_tu(vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_tu(vuint32m2_t maskedoff, vuint16m1_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_tu(vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_tu(vuint32m4_t maskedoff, vuint16m2_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_tu(vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_tu(vuint32m8_t maskedoff, vuint16m4_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 
[[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_tu(vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_tu(vuint64m1_t maskedoff, vuint32mf2_t src, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_tu(vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_tu(vuint64m2_t maskedoff, vuint32m1_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_tu(vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_tu(vuint64m4_t maskedoff, vuint32m2_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 
0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_tu(vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8_tu(vuint64m8_t maskedoff, vuint32m4_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vu // 
CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, v 
// CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { @@ -309,7 +309,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -319,7 +319,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedof // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -329,7 +329,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedof // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -339,7 +339,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -349,7 +349,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -359,7 +359,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -369,7 +369,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, v // 
CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -379,7 +379,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedof // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -389,7 +389,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -399,7 +399,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -409,7 +409,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -419,7 +419,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -429,7 +429,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -439,7 +439,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -449,7 +449,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, 
// CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { @@ -459,7 +459,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -469,7 +469,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -479,7 +479,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -489,7 +489,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -499,7 +499,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -509,7 +509,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -519,7 +519,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -529,7 +529,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, // 
CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -539,7 +539,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -549,7 +549,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -559,7 +559,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -569,7 +569,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -579,7 +579,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -589,7 +589,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -599,7 +599,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_mu(vbool8_t mask, vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvt.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvt.c index 4376c5d9860e0..4e40521d59137 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvt.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvt.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_tu(vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_tu(vint16mf4_t maskedoff, vint8mf8_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_tu(vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_tu(vint16mf2_t maskedoff, vint8mf4_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_tu(vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_tu(vint16m1_t maskedoff, vint8mf2_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_tu(vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_tu(vint16m2_t maskedoff, vint8m1_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_tu(vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_tu(vint16m4_t maskedoff, vint8m2_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_tu(vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_tu(vint16m8_t maskedoff, vint8m4_t src, size_t // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_tu(vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_tu(vint32mf2_t maskedoff, vint16mf4_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_tu(vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_tu(vint32m1_t maskedoff, vint16mf2_t src, size // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_tu(vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_tu(vint32m2_t maskedoff, vint16m1_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_tu(vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_tu(vint32m4_t maskedoff, vint16m2_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_tu(vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_tu(vint32m8_t maskedoff, vint16m4_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_tu(vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_tu(vint64m1_t maskedoff, vint32mf2_t src, size // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_tu(vint64m2_t maskedoff, vint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_tu(vint64m2_t maskedoff, vint32m1_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_tu(vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_tu(vint64m4_t maskedoff, vint32m2_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_tu(vint64m8_t maskedoff, vint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8_tu(vint64m8_t maskedoff, vint32m4_t src, size_ // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_tum(vbool64_t mask, vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_tum(vbool64_t mask, vint16mf4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_tum(vbool32_t mask, vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_tum(vbool32_t mask, vint16mf2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_tum(vbool16_t mask, vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_tum(vbool16_t mask, vint16m1_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_tum(vbool8_t mask, vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_tum(vbool8_t mask, vint16m2_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_tum(vbool4_t mask, vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_tum(vbool4_t mask, vint16m4_t maskedoff, vint8 
// CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_tum(vbool2_t mask, vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_tum(vbool2_t mask, vint16m8_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_tum(vbool64_t mask, vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_tum(vbool64_t mask, vint32mf2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_tum(vbool32_t mask, vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_tum(vbool32_t mask, vint32m1_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_tum(vbool16_t mask, vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_tum(vbool16_t mask, vint32m2_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_tum(vbool8_t mask, vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_tum(vbool8_t mask, vint32m4_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], 
[[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_tum(vbool4_t mask, vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_tum(vbool4_t mask, vint32m8_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_tum(vbool64_t mask, vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_tum(vbool64_t mask, vint64m1_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_tum(vbool32_t mask, vint64m2_t maskedoff, vint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_tum(vbool32_t mask, vint64m2_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_tum // CHECK-RV64-SAME: 
( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_tum(vbool16_t mask, vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_tum(vbool16_t mask, vint64m4_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_tum(vbool8_t mask, vint64m8_t maskedoff, vint32m4_t src, size_t vl) { @@ -309,7 +309,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8_tum(vbool8_t mask, vint64m8_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_tumu(vbool64_t mask, vint16mf4_t maskedoff, 
vint8mf8_t src, size_t vl) { @@ -319,7 +319,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_tumu(vbool64_t mask, vint16mf4_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_tumu(vbool32_t mask, vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -329,7 +329,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_tumu(vbool32_t mask, vint16mf2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_tumu(vbool16_t mask, vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -339,7 +339,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_tumu(vbool16_t mask, vint16m1_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_tumu(vbool8_t mask, vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -349,7 +349,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_tumu(vbool8_t mask, vint16m2_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_tumu(vbool4_t mask, vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -359,7 +359,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_tumu(vbool4_t mask, vint16m4_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_tumu(vbool2_t mask, vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -369,7 +369,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_tumu(vbool2_t mask, vint16m8_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_tumu(vbool64_t mask, vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -379,7 +379,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_tumu(vbool64_t mask, vint32mf2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_tumu(vbool32_t mask, vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -389,7 +389,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_tumu(vbool32_t mask, vint32m1_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_tumu(vbool16_t mask, vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -399,7 +399,7 @@ vint32m2_t 
test_vwcvt_x_x_v_i32m2_tumu(vbool16_t mask, vint32m2_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_tumu(vbool8_t mask, vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -409,7 +409,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_tumu(vbool8_t mask, vint32m4_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_tumu(vbool4_t mask, vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -419,7 +419,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_tumu(vbool4_t mask, vint32m8_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( 
[[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_tumu(vbool64_t mask, vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -429,7 +429,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_tumu(vbool64_t mask, vint64m1_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_tumu(vbool32_t mask, vint64m2_t maskedoff, vint32m1_t src, size_t vl) { @@ -439,7 +439,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_tumu(vbool32_t mask, vint64m2_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_tumu(vbool16_t mask, vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -449,7 +449,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_tumu(vbool16_t mask, vint64m4_t maskedoff, vin // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_tumu(vbool8_t mask, vint64m8_t maskedoff, vint32m4_t src, size_t vl) { @@ -459,7 +459,7 @@ vint64m8_t test_vwcvt_x_x_v_i64m8_tumu(vbool8_t mask, vint64m8_t maskedoff, vint // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwcvt_x_x_v_i16mf4_mu(vbool64_t mask, vint16mf4_t maskedoff, vint8mf8_t src, size_t vl) { @@ -469,7 +469,7 @@ vint16mf4_t test_vwcvt_x_x_v_i16mf4_mu(vbool64_t mask, vint16mf4_t maskedoff, vi // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwcvt_x_x_v_i16mf2_mu(vbool32_t mask, vint16mf2_t maskedoff, vint8mf4_t src, size_t vl) { @@ -479,7 +479,7 @@ vint16mf2_t test_vwcvt_x_x_v_i16mf2_mu(vbool32_t mask, vint16mf2_t 
maskedoff, vi // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwcvt_x_x_v_i16m1_mu(vbool16_t mask, vint16m1_t maskedoff, vint8mf2_t src, size_t vl) { @@ -489,7 +489,7 @@ vint16m1_t test_vwcvt_x_x_v_i16m1_mu(vbool16_t mask, vint16m1_t maskedoff, vint8 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwcvt_x_x_v_i16m2_mu(vbool8_t mask, vint16m2_t maskedoff, vint8m1_t src, size_t vl) { @@ -499,7 +499,7 @@ vint16m2_t test_vwcvt_x_x_v_i16m2_mu(vbool8_t mask, vint16m2_t maskedoff, vint8m // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vint16m4_t test_vwcvt_x_x_v_i16m4_mu(vbool4_t mask, vint16m4_t maskedoff, vint8m2_t src, size_t vl) { @@ -509,7 +509,7 @@ vint16m4_t test_vwcvt_x_x_v_i16m4_mu(vbool4_t mask, vint16m4_t maskedoff, vint8m // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i16m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwcvt_x_x_v_i16m8_mu(vbool2_t mask, vint16m8_t maskedoff, vint8m4_t src, size_t vl) { @@ -519,7 +519,7 @@ vint16m8_t test_vwcvt_x_x_v_i16m8_mu(vbool2_t mask, vint16m8_t maskedoff, vint8m // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwcvt_x_x_v_i32mf2_mu(vbool64_t mask, vint32mf2_t maskedoff, vint16mf4_t src, size_t vl) { @@ -529,7 +529,7 @@ vint32mf2_t test_vwcvt_x_x_v_i32mf2_mu(vbool64_t mask, vint32mf2_t maskedoff, vi // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], 
i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwcvt_x_x_v_i32m1_mu(vbool32_t mask, vint32m1_t maskedoff, vint16mf2_t src, size_t vl) { @@ -539,7 +539,7 @@ vint32m1_t test_vwcvt_x_x_v_i32m1_mu(vbool32_t mask, vint32m1_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwcvt_x_x_v_i32m2_mu(vbool16_t mask, vint32m2_t maskedoff, vint16m1_t src, size_t vl) { @@ -549,7 +549,7 @@ vint32m2_t test_vwcvt_x_x_v_i32m2_mu(vbool16_t mask, vint32m2_t maskedoff, vint1 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwcvt_x_x_v_i32m4_mu(vbool8_t mask, vint32m4_t maskedoff, vint16m2_t src, size_t vl) { @@ -559,7 +559,7 @@ vint32m4_t test_vwcvt_x_x_v_i32m4_mu(vbool8_t mask, vint32m4_t maskedoff, vint16 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i32m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwcvt_x_x_v_i32m8_mu(vbool4_t mask, vint32m8_t maskedoff, vint16m4_t src, size_t vl) { @@ -569,7 +569,7 @@ vint32m8_t test_vwcvt_x_x_v_i32m8_mu(vbool4_t mask, vint32m8_t maskedoff, vint16 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwcvt_x_x_v_i64m1_mu(vbool64_t mask, vint64m1_t maskedoff, vint32mf2_t src, size_t vl) { @@ -579,7 +579,7 @@ vint64m1_t test_vwcvt_x_x_v_i64m1_mu(vbool64_t mask, vint64m1_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwcvt_x_x_v_i64m2_mu(vbool32_t mask, vint64m2_t maskedoff, vint32m1_t src, 
size_t vl) { @@ -589,7 +589,7 @@ vint64m2_t test_vwcvt_x_x_v_i64m2_mu(vbool32_t mask, vint64m2_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwcvt_x_x_v_i64m4_mu(vbool16_t mask, vint64m4_t maskedoff, vint32m2_t src, size_t vl) { @@ -599,7 +599,7 @@ vint64m4_t test_vwcvt_x_x_v_i64m4_mu(vbool16_t mask, vint64m4_t maskedoff, vint3 // CHECK-RV64-LABEL: define dso_local @test_vwcvt_x_x_v_i64m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwadd.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwcvt_x_x_v_i64m8_mu(vbool8_t mask, vint64m8_t maskedoff, vint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvtu.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvtu.c index a0175dfc4ddbb..006e51b499e24 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvtu.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vwcvtu.c @@ -9,7 +9,7 @@ // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_tu // CHECK-RV64-SAME: ( 
[[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -19,7 +19,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint8mf8_t src // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -29,7 +29,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint8mf4_t src // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_tu(vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -39,7 +39,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_tu(vuint16m1_t maskedoff, vuint8mf2_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_tu // 
CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_tu(vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -49,7 +49,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_tu(vuint16m2_t maskedoff, vuint8m1_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_tu(vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -59,7 +59,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_tu(vuint16m4_t maskedoff, vuint8m2_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_tu(vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -69,7 +69,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_tu(vuint16m8_t maskedoff, vuint8m4_t src, si // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_tu // 
CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -79,7 +79,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint16mf4_t sr // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_tu(vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -89,7 +89,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_tu(vuint32m1_t maskedoff, vuint16mf2_t src, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_tu(vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -99,7 +99,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_tu(vuint32m2_t maskedoff, vuint16m1_t src, s // CHECK-RV64-LABEL: define dso_local 
@test_vwcvtu_x_x_v_u32m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_tu(vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -109,7 +109,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_tu(vuint32m4_t maskedoff, vuint16m2_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_tu(vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -119,7 +119,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_tu(vuint32m8_t maskedoff, vuint16m4_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_tu(vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -129,7 +129,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_tu(vuint64m1_t maskedoff, vuint32mf2_t src, // CHECK-RV64-LABEL: 
define dso_local @test_vwcvtu_x_x_v_u64m2_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_tu(vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -139,7 +139,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_tu(vuint64m2_t maskedoff, vuint32m1_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_tu(vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -149,7 +149,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_tu(vuint64m4_t maskedoff, vuint32m2_t src, s // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_tu // CHECK-RV64-SAME: ( [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, i64 [[VL]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_tu(vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { @@ -159,7 +159,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8_tu(vuint64m8_t maskedoff, vuint32m4_t src, s // 
CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -169,7 +169,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -179,7 +179,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -189,7 +189,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -199,7 +199,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -209,7 +209,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -219,7 +219,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -229,7 +229,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -239,7 +239,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, v 
// CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -249,7 +249,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -259,7 +259,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 2) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -269,7 +269,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -279,7 +279,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -289,7 +289,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -299,7 +299,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_tum // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { @@ -309,7 +309,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -319,7 +319,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedof // 
CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -329,7 +329,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedof // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -339,7 +339,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -349,7 +349,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -359,7 +359,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -369,7 +369,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -379,7 +379,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedof // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -389,7 +389,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -399,7 +399,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_tumu(vbool16_t mask, vuint32m2_t 
maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -409,7 +409,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -419,7 +419,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], 
i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -429,7 +429,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwcvtu_x_x_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -439,7 +439,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -449,7 +449,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_tumu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { @@ -459,7 +459,7 @@ vuint64m8_t test_vwcvtu_x_x_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, v // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i16.nxv1i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, vuint8mf8_t src, size_t vl) { @@ -469,7 +469,7 @@ vuint16mf4_t test_vwcvtu_x_x_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i16.nxv2i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, vuint8mf4_t src, size_t vl) { @@ -479,7 +479,7 @@ vuint16mf2_t test_vwcvtu_x_x_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, // 
CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i16.nxv4i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwcvtu_x_x_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vuint8mf2_t src, size_t vl) { @@ -489,7 +489,7 @@ vuint16m1_t test_vwcvtu_x_x_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i16.nxv8i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwcvtu_x_x_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vuint8m1_t src, size_t vl) { @@ -499,7 +499,7 @@ vuint16m2_t test_vwcvtu_x_x_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i16.nxv16i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vuint16m4_t test_vwcvtu_x_x_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vuint8m2_t src, size_t vl) { @@ -509,7 +509,7 @@ vuint16m4_t test_vwcvtu_x_x_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u16m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv32i16.nxv32i8.i8.i64( [[MASKEDOFF]], [[SRC]], i8 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwcvtu_x_x_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vuint8m4_t src, size_t vl) { @@ -519,7 +519,7 @@ vuint16m8_t test_vwcvtu_x_x_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32mf2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i32.nxv1i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, vuint16mf4_t src, size_t vl) { @@ -529,7 +529,7 @@ vuint32mf2_t test_vwcvtu_x_x_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i32.i64( 
[[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i32.nxv2i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwcvtu_x_x_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vuint16mf2_t src, size_t vl) { @@ -539,7 +539,7 @@ vuint32m1_t test_vwcvtu_x_x_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i32.nxv4i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwcvtu_x_x_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vuint16m1_t src, size_t vl) { @@ -549,7 +549,7 @@ vuint32m2_t test_vwcvtu_x_x_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u32m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i32.nxv8i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwcvtu_x_x_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vuint16m2_t src, size_t vl) { @@ -559,7 +559,7 @@ vuint32m4_t test_vwcvtu_x_x_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local 
@test_vwcvtu_x_x_v_u32m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv16i32.nxv16i16.i16.i64( [[MASKEDOFF]], [[SRC]], i16 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwcvtu_x_x_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vuint16m4_t src, size_t vl) { @@ -569,7 +569,7 @@ vuint32m8_t test_vwcvtu_x_x_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vui // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m1_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv1i64.nxv1i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwcvtu_x_x_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vuint32mf2_t src, size_t vl) { @@ -579,7 +579,7 @@ vuint64m1_t test_vwcvtu_x_x_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m2_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv2i64.nxv2i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t 
test_vwcvtu_x_x_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vuint32m1_t src, size_t vl) { @@ -589,7 +589,7 @@ vuint64m2_t test_vwcvtu_x_x_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m4_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv4i64.nxv4i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwcvtu_x_x_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vuint32m2_t src, size_t vl) { @@ -599,7 +599,7 @@ vuint64m4_t test_vwcvtu_x_x_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vu // CHECK-RV64-LABEL: define dso_local @test_vwcvtu_x_x_v_u64m8_mu // CHECK-RV64-SAME: ( [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i64.i64( [[MASKEDOFF]], [[SRC]], i64 0, [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwaddu.mask.nxv8i64.nxv8i32.i32.i64( [[MASKEDOFF]], [[SRC]], i32 0, [[MASK]], i64 [[VL]], i64 1) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwcvtu_x_x_v_u64m8_mu(vbool8_t mask, vuint64m8_t maskedoff, vuint32m4_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei32.c new file mode 100644 index 0000000000000..7bf0a4e5b7b1f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei32.c @@ -0,0 +1,118 @@ +// NOTE: 
Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4(const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxei32_v_bf16mf4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2(const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxei32_v_bf16mf2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxei32_v_bf16m1(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxei32_v_bf16m2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxei32_v_bf16m4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf2_m(vm, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m1_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei64.c new file mode 100644 index 0000000000000..be42373070f9d --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4(const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxei64_v_bf16mf4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2(const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxei64_v_bf16mf2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxei64_v_bf16m1(rs1, rs2, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxei64_v_bf16m2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m1_t test_vloxei64_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m1_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei8.c new file mode 100644 index 0000000000000..767405acfde03 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4(const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vloxei8_v_bf16mf4(rs1, rs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2(const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vloxei8_v_bf16mf2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxei8_v_bf16m1(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxei8_v_bf16m2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxei8_v_bf16m4(rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8(const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vloxei8_v_bf16m8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t 
test_vloxei8_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m1_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_m(vbool2_t vm, const __bf16 *rs1, + vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei32.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei32.c new file mode 100644 index 0000000000000..adafe97dff8b8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei32.c @@ -0,0 +1,120 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { 
+ return __riscv_vloxseg2ei32_v_bf16mf2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m1x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t 
test_vloxseg2ei32_v_bf16m4x2(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf4x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m1x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m4x2_m(vm, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei64.c new file mode 100644 index 0000000000000..0be1d3fb19ae8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] 
+// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m1x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], 
i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf4x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m1x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m2x2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei8.c new file mode 100644 index 0000000000000..8a6d93d429dff --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf4x2(rs1, rs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m1x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return 
__riscv_vloxseg2ei8_v_bf16m2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf4x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m1x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m4x2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei32.c new file mode 100644 index 0000000000000..00d0043a5e81b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf4x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m1x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m2x3(rs1, 
rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf4x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf2x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m1x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m2x3_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei64.c new file mode 100644 index 0000000000000..0685d0cab692d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf4x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m1x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf4x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + 
return __riscv_vloxseg3ei64_v_bf16mf2x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m1x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m2x3_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei8.c new file mode 100644 index 0000000000000..b68c5f56a65b8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have 
been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf4x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m1x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf4x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf2x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m1x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + 
return __riscv_vloxseg3ei8_v_bf16m2x3_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei32.c new file mode 100644 index 0000000000000..0f65998c8f30b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf4x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m1x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf4x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf2x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m1x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 
4) @test_vloxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m2x4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei64.c new file mode 100644 index 0000000000000..9c120fc68fa82 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] 
+// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf4x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m1x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// 
CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf4x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf2x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m1x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m2x4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei8.c new file mode 100644 index 0000000000000..f7011650f9ed5 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf4x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4(const __bf16 *rs1, 
vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m1x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf4x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf2x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m1x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m2x4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei32.c new file mode 100644 index 
0000000000000..c100047f765c0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf4x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf2x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) 
@test_vloxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg5ei32_v_bf16m1x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf4x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, + const 
__bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf2x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16m1x5_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei64.c new file mode 100644 index 0000000000000..75342b035c3a2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf4x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf2x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg5ei64_v_bf16m1x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf4x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf2x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16m1x5_m(vm, rs1, rs2, vl); +} 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei8.c new file mode 100644 index 0000000000000..0e0b2ab502669 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf4x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) 
[[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf2x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg5ei8_v_bf16m1x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf4x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf2x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16m1x5_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei32.c new file mode 100644 index 0000000000000..5d686a64b3ff6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf4x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf2x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return 
__riscv_vloxseg6ei32_v_bf16m1x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf4x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf2x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16m1x6_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei64.c new file mode 100644 index 0000000000000..180e0e3946c57 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf4x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf2x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg6ei64_v_bf16m1x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf4x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vloxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf2x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16m1x6_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei8.c new file mode 100644 index 0000000000000..cb25709918f66 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 
-triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf4x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf2x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) poison, 
ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg6ei8_v_bf16m1x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf4x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf2x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16m1x6_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei32.c new file mode 100644 index 0000000000000..a9c095a02ee80 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf4x7(rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf2x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg7ei32_v_bf16m1x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, + 
const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf4x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf2x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16m1x7_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei64.c new file mode 100644 index 0000000000000..89d8f75f74b0b --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf4x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf2x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg7ei64_v_bf16m1x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf4x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, 
size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf2x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16m1x7_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei8.c new file mode 100644 index 0000000000000..e4aeaadd629c0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf4x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf2x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg7ei8_v_bf16m1x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf4x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf2x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16m1x7_m(vm, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei32.c new file mode 100644 index 0000000000000..552880c5d24b7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf4x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) 
[[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf2x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei32_v_bf16m1x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf4x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf2x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16m1x8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei64.c new file mode 100644 index 0000000000000..f57756c4afb3c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf4x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf2x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return 
__riscv_vloxseg8ei64_v_bf16m1x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf4x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf2x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16m1x8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei8.c new file mode 100644 index 0000000000000..6ae89de151b40 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vloxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf4x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf2x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei8_v_bf16m1x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf4x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) 
@test_vloxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf2x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16m1x8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei32.c new file mode 100644 index 0000000000000..44a77b981ceb6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 
-target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4(const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxei32_v_bf16mf4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2(const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxei32_v_bf16mf2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxei32_v_bf16m1(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i32.i64( poison, ptr 
[[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxei32_v_bf16m2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxei32_v_bf16m4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m1_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei64.c new file mode 100644 index 0000000000000..67645b8ba5cb7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// 
REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4(const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxei64_v_bf16mf4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2(const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxei64_v_bf16mf2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxei64_v_bf16m1(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxei64_v_bf16m2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m1_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vluxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei8.c new file mode 100644 index 0000000000000..f7ac2be80e08d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4(const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vluxei8_v_bf16mf4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2(const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vluxei8_v_bf16mf2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxei8_v_bf16m1(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxei8_v_bf16m2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxei8_v_bf16m4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8(const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vluxei8_v_bf16m8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m1_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vluxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_m(vbool2_t vm, const __bf16 *rs1, + vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei32.c new file mode 100644 index 0000000000000..e570c09397e3d --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei32.c @@ -0,0 +1,120 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m1x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf4x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m1x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m4x2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei64.c new file mode 100644 index 0000000000000..cc806af77d848 --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2( 
+// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m1x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf4x2_m(vm, rs1, 
rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m1x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m2x2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei8.c new file mode 100644 index 0000000000000..871567122ce16 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m1x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m2x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m4x2(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf4x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m1x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m2x2_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", 
, 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m4x2_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei32.c new file mode 100644 index 0000000000000..770d7f2c255f8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf4x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m1x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf4x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf2x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m1x3_m(vm, rs1, rs2, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m2x3_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei64.c new file mode 100644 index 0000000000000..505b2096aaf13 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 
4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf4x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m1x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf4x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf2x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m1x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m2x3_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei8.c new file mode 100644 index 0000000000000..708f28667f8ab --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// 
RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf4x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// 
CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m1x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m2x3(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf4x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf2x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m1x3_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m2x3_m(vm, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei32.c new file mode 100644 index 0000000000000..7c300c57ad9ae --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf4x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) 
[[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m1x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf4x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf2x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m1x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m2x4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei64.c new file mode 100644 index 0000000000000..db177f38000ed --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return 
__riscv_vluxseg4ei64_v_bf16mf4x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m1x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4(const 
__bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf4x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf2x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m1x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m2x4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei8.c new file mode 100644 index 0000000000000..fa52ffcb0a53f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf4x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return 
__riscv_vluxseg4ei8_v_bf16m1x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m2x4(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf4x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf2x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m1x4_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m2x4_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei32.c new file mode 100644 index 0000000000000..393cc317cef38 --- /dev/null 
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf4x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf2x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei32_v_bf16m1x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf4x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, 
size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf2x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16m1x5_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei64.c new file mode 100644 index 0000000000000..8048dbed774bf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf4x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf2x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg5ei64_v_bf16m1x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf4x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf2x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16m1x5_m(vm, rs1, rs2, vl); +} 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei8.c new file mode 100644 index 0000000000000..8fc02319cfba5 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf4x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) 
[[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf2x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei8_v_bf16m1x5(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf4x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf2x5_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16m1x5_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei32.c new file mode 100644 index 0000000000000..a5f680c8b7662 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf4x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf2x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return 
__riscv_vluxseg6ei32_v_bf16m1x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf4x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf2x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16m1x6_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei64.c new file mode 100644 index 0000000000000..50a5933e228ac --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf4x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf2x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg6ei64_v_bf16m1x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf4x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vluxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf2x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16m1x6_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei8.c new file mode 100644 index 0000000000000..8684080d2d362 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 
-triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf4x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf2x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) poison, 
ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg6ei8_v_bf16m1x6(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf4x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf2x6_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16m1x6_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei32.c new file mode 100644 index 0000000000000..6bdcf10de0d34 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf4x7(rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf2x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei32_v_bf16m1x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, + 
const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf4x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf2x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16m1x7_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei64.c new file mode 100644 index 0000000000000..7cdd26a6aa481 --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf4x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf2x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg7ei64_v_bf16m1x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf4x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, 
size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf2x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16m1x7_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei8.c new file mode 100644 index 0000000000000..6ae6bd3e631d8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf4x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf2x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei8_v_bf16m1x7(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf4x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf2x7_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16m1x7_m(vm, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei32.c new file mode 100644 index 0000000000000..bbcae0625be42 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf4x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) 
[[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf2x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei32_v_bf16m1x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf4x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf2x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16m1x8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei64.c new file mode 100644 index 0000000000000..42b3365d1116f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf4x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf2x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return 
__riscv_vluxseg8ei64_v_bf16m1x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf4x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf2x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16m1x8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei8.c new file mode 100644 index 0000000000000..a50bb4cedc6d6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vluxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf4x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf2x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei8_v_bf16m1x8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf4x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) 
@test_vluxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf2x8_m(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16m1x8_m(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei32.c new file mode 100644 index 0000000000000..775a28a4fe1fd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 
-target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf4(__bf16 *rs1, vuint32mf2_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsoxei32_v_bf16mf4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf2(__bf16 *rs1, vuint32m1_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsoxei32_v_bf16mf2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m1(__bf16 *rs1, vuint32m2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsoxei32_v_bf16m1(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxei.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m2(__bf16 *rs1, vuint32m4_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsoxei32_v_bf16m2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m4(__bf16 *rs1, vuint32m8_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsoxei32_v_bf16m4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint32mf2_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsoxei32_v_bf16mf4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsoxei32_v_bf16mf2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m1_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsoxei32_v_bf16m1_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsoxei32_v_bf16m2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsoxei32_v_bf16m4_m(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei64.c new file mode 100644 index 0000000000000..d69aa335b05b5 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei64.c @@ 
-0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf4(__bf16 *rs1, vuint64m1_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsoxei64_v_bf16mf4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf2(__bf16 *rs1, vuint64m2_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsoxei64_v_bf16mf2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m1(__bf16 *rs1, vuint64m4_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsoxei64_v_bf16m1(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m2(__bf16 *rs1, vuint64m8_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsoxei64_v_bf16m2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsoxei64_v_bf16mf4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsoxei64_v_bf16mf2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsoxei64_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsoxei64_v_bf16m1_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsoxei64_v_bf16m2_m(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei8.c new file mode 100644 index 0000000000000..e0e8376297555 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf4(__bf16 *rs1, vuint8mf8_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return 
__riscv_vsoxei8_v_bf16mf4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf2(__bf16 *rs1, vuint8mf4_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsoxei8_v_bf16mf2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m1(__bf16 *rs1, vuint8mf2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsoxei8_v_bf16m1(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m2(__bf16 *rs1, vuint8m1_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsoxei8_v_bf16m2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m4(__bf16 *rs1, vuint8m2_t rs2, 
vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsoxei8_v_bf16m4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m8(__bf16 *rs1, vuint8m4_t rs2, vbfloat16m8_t vs3, + size_t vl) { + return __riscv_vsoxei8_v_bf16m8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsoxei8_v_bf16mf4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsoxei8_v_bf16mf2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsoxei8_v_bf16m1_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsoxei8_v_bf16m2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsoxei8_v_bf16m4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m8_m(vbool2_t vm, __bf16 *rs1, vuint8m4_t rs2, + vbfloat16m8_t vs3, size_t vl) { + return __riscv_vsoxei8_v_bf16m8_m(vm, rs1, rs2, vs3, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei32.c new file mode 100644 index 0000000000000..560d303933a90 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei32.c @@ -0,0 +1,119 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf4x2(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16mf4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf2x2(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + 
return __riscv_vsoxseg2ei32_v_bf16mf2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m1x2(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16m1x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m2x2(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16m2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m4x2(__bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16m4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x2_t vs3, + size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16mf4x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16mf2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16m1x2_m(vm, rs1, 
vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16m2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32_v_bf16m4x2_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei64.c new file mode 100644 index 0000000000000..41debc8613635 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature 
+zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf4x2(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16mf4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf2x2(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16mf2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m1x2(__bf16 *rs1, vuint64m4_t vs2, + 
vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16m1x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m2x2(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16m2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16mf4x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + 
vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16mf2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16m1x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64_v_bf16m2x2_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei8.c new file mode 100644 index 0000000000000..55b3ff48537a4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// 
REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf4x2(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16mf4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf2x2(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16mf2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsoxseg2ei8_v_bf16m1x2(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16m1x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m2x2(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16m2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m4x2(__bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16m4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + 
vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16mf4x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16mf2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16m1x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsoxseg2ei8_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16m2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8_v_bf16m4x2_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei32.c new file mode 100644 index 0000000000000..9da13af17a33f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf4x3(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16mf4x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf2x3(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16mf2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m1x3(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16m1x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) 
[[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m2x3(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16m2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x3_t vs3, + size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16mf4x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16mf2x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16m1x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32_v_bf16m2x3_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei64.c new file mode 100644 index 0000000000000..f8e974cff66cd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf4x3(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16mf4x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf2x3(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16mf2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m1x3(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16m1x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m2x3(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16m2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16mf4x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16mf2x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16m1x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64_v_bf16m2x3_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei8.c new file mode 100644 index 0000000000000..d80f01903e5c0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf4x3(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16mf4x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf2x3(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16mf2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m1x3(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16m1x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m2x3(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16m2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16mf4x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16mf2x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16m1x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8_v_bf16m2x3_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei32.c new file mode 100644 index 0000000000000..dec74165c9e87 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf4x4(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16mf4x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf2x4(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16mf2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m1x4(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16m1x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m2x4(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16m2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x4_t vs3, + size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16mf4x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16mf2x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local void @test_vsoxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16m1x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32_v_bf16m2x4_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei64.c new file mode 100644 index 0000000000000..9b042b97ba547 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin 
-disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf4x4(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64_v_bf16mf4x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf2x4(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64_v_bf16mf2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m1x4(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return 
__riscv_vsoxseg4ei64_v_bf16m1x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m2x4(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64_v_bf16m2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64_v_bf16mf4x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return 
__riscv_vsoxseg4ei64_v_bf16mf2x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64_v_bf16m1x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64_v_bf16m2x4_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei8.c new file mode 100644 index 0000000000000..5c1ec6cbfd3ff --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: 
%clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf4x4(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16mf4x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf2x4(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16mf2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m1x4(__bf16 *rs1, 
vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16m1x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m2x4(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16m2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16mf4x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t 
vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16mf2x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16m1x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8_v_bf16m2x4_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei32.c new file mode 100644 index 0000000000000..e278c9da207ec --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// 
REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf4x5(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32_v_bf16mf4x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf2x5(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32_v_bf16mf2x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
void +// +void test_vsoxseg5ei32_v_bf16m1x5(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32_v_bf16m1x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x5_t vs3, + size_t vl) { + return __riscv_vsoxseg5ei32_v_bf16mf4x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32_v_bf16mf2x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], 
i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32_v_bf16m1x5_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei64.c new file mode 100644 index 0000000000000..1439ab40b5be8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf4x5(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64_v_bf16mf4x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf2x5(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64_v_bf16mf2x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16m1x5(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64_v_bf16m1x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64_v_bf16mf4x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64_v_bf16mf2x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64_v_bf16m1x5_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei8.c new file mode 100644 index 0000000000000..1b72716f25088 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf4x5(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8_v_bf16mf4x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf2x5(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8_v_bf16mf2x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16m1x5(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8_v_bf16m1x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8_v_bf16mf4x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8_v_bf16mf2x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8_v_bf16m1x5_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei32.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei32.c new file mode 100644 index 0000000000000..7c659d353fdc4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf4x6(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32_v_bf16mf4x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf2x6(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32_v_bf16mf2x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16m1x6(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32_v_bf16m1x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x6_t vs3, + size_t vl) { + return __riscv_vsoxseg6ei32_v_bf16mf4x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32_v_bf16mf2x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local void @test_vsoxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32_v_bf16m1x6_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei64.c new file mode 100644 index 0000000000000..b15b31e58fd01 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf4x6(__bf16 *rs1, vuint64m1_t vs2, + 
vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64_v_bf16mf4x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf2x6(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64_v_bf16mf2x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16m1x6(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64_v_bf16m1x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return 
__riscv_vsoxseg6ei64_v_bf16mf4x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64_v_bf16mf2x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64_v_bf16m1x6_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei8.c new file mode 100644 index 0000000000000..a18dc0cdc31cf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: 
%clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf4x6(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8_v_bf16mf4x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf2x6(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8_v_bf16mf2x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16m1x6(__bf16 *rs1, 
vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8_v_bf16m1x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8_v_bf16mf4x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8_v_bf16mf2x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsoxseg6ei8_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8_v_bf16m1x6_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei32.c new file mode 100644 index 0000000000000..6e41b5491682e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf4x7(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32_v_bf16mf4x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) 
[[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf2x7(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32_v_bf16mf2x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16m1x7(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32_v_bf16m1x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x7_t vs3, + size_t vl) { + return __riscv_vsoxseg7ei32_v_bf16mf4x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], 
[[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32_v_bf16mf2x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32_v_bf16m1x7_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei64.c new file mode 100644 index 0000000000000..ee8cedf02225f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf4x7(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64_v_bf16mf4x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf2x7(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64_v_bf16mf2x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16m1x7(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64_v_bf16m1x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64_v_bf16mf4x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64_v_bf16mf2x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64_v_bf16m1x7_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei8.c new file 
mode 100644 index 0000000000000..ed07ca1803b75 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf4x7(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8_v_bf16mf4x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf2x7(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8_v_bf16mf2x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) 
[[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16m1x7(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8_v_bf16m1x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8_v_bf16mf4x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8_v_bf16mf2x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8_v_bf16m1x7_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei32.c new file mode 100644 index 0000000000000..c2af8bdb067dc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf4x8(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32_v_bf16mf4x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local void @test_vsoxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf2x8(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32_v_bf16mf2x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16m1x8(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32_v_bf16m1x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x8_t vs3, + size_t vl) { + return __riscv_vsoxseg8ei32_v_bf16mf4x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf2x8_m( 
+// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32_v_bf16mf2x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32_v_bf16m1x8_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei64.c new file mode 100644 index 0000000000000..ec8bc85ac4f86 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | 
opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf4x8(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64_v_bf16mf4x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf2x8(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64_v_bf16mf2x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16m1x8(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64_v_bf16m1x8(rs1, vs2, vs3, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64_v_bf16mf4x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64_v_bf16mf2x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return 
__riscv_vsoxseg8ei64_v_bf16m1x8_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei8.c new file mode 100644 index 0000000000000..5ecd7ff291b8c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsoxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf4x8(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8_v_bf16mf4x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf2x8(__bf16 *rs1, 
vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8_v_bf16mf2x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16m1x8(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8_v_bf16m1x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8_v_bf16mf4x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t 
vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8_v_bf16mf2x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8_v_bf16m1x8_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei32.c new file mode 100644 index 0000000000000..af8509247cf26 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf4(__bf16 *rs1, vuint32mf2_t 
rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsuxei32_v_bf16mf4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf2(__bf16 *rs1, vuint32m1_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsuxei32_v_bf16mf2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m1(__bf16 *rs1, vuint32m2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsuxei32_v_bf16m1(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m2(__bf16 *rs1, vuint32m4_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsuxei32_v_bf16m2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// 
+void test_vsuxei32_v_bf16m4(__bf16 *rs1, vuint32m8_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsuxei32_v_bf16m4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint32mf2_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsuxei32_v_bf16mf4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsuxei32_v_bf16mf2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsuxei32_v_bf16m1_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsuxei32_v_bf16m2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsuxei32_v_bf16m4_m(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei64.c new file mode 100644 index 0000000000000..d230cd6b4b757 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxei.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf4(__bf16 *rs1, vuint64m1_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsuxei64_v_bf16mf4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf2(__bf16 *rs1, vuint64m2_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsuxei64_v_bf16mf2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m1(__bf16 *rs1, vuint64m4_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsuxei64_v_bf16m1(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m2(__bf16 *rs1, vuint64m8_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsuxei64_v_bf16m2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsuxei64_v_bf16mf4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsuxei64_v_bf16mf2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsuxei64_v_bf16m1_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t rs2, + vbfloat16m2_t vs3, size_t vl) 
{ + return __riscv_vsuxei64_v_bf16m2_m(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei8.c new file mode 100644 index 0000000000000..6c91f5feb6d16 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf4(__bf16 *rs1, vuint8mf8_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsuxei8_v_bf16mf4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf2(__bf16 *rs1, vuint8mf4_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsuxei8_v_bf16mf2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m1(__bf16 *rs1, vuint8mf2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsuxei8_v_bf16m1(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m2(__bf16 *rs1, vuint8m1_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsuxei8_v_bf16m2(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m4(__bf16 *rs1, vuint8m2_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsuxei8_v_bf16m4(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m8(__bf16 *rs1, vuint8m4_t rs2, vbfloat16m8_t vs3, + size_t vl) { + return __riscv_vsuxei8_v_bf16m8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf4_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsuxei8_v_bf16mf4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsuxei8_v_bf16mf2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsuxei8_v_bf16m1_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxei8_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsuxei8_v_bf16m2_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsuxei8_v_bf16m4_m(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m8_m(vbool2_t vm, __bf16 *rs1, vuint8m4_t rs2, + vbfloat16m8_t vs3, size_t vl) { + return __riscv_vsuxei8_v_bf16m8_m(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei32.c new file mode 100644 index 0000000000000..95e346cdd4c63 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei32.c @@ -0,0 +1,119 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S 
-passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf4x2(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16mf4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf2x2(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16mf2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m1x2(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16m1x2(rs1, vs2, vs3, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m2x2(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16m2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m4x2(__bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16m4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x2_t vs3, + size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16mf4x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16mf2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16m1x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16m2x2_m(vm, rs1, vs2, 
vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32_v_bf16m4x2_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei64.c new file mode 100644 index 0000000000000..9001cb1e23da0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxseg2ei64_v_bf16mf4x2(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16mf4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf2x2(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16mf2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m1x2(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16m1x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m2x2(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return 
__riscv_vsuxseg2ei64_v_bf16m2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16mf4x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16mf2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, 
vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16m1x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64_v_bf16m2x2_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei8.c new file mode 100644 index 0000000000000..3c5490439282c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr 
[[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf4x2(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16mf4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf2x2(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16mf2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m1x2(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16m1x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m2x2(__bf16 *rs1, 
vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16m2x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m4x2(__bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16m4x2(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16mf4x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t 
vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16mf2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16m1x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16m2x2_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxseg2ei8_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8_v_bf16m4x2_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei32.c new file mode 100644 index 0000000000000..8fd5dd337ea5b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf4x3(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16mf4x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) 
[[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf2x3(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16mf2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m1x3(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16m1x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m2x3(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16m2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
void +// +void test_vsuxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x3_t vs3, + size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16mf4x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16mf2x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16m1x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr 
[[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32_v_bf16m2x3_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei64.c new file mode 100644 index 0000000000000..6ebaff9a8deef --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf4x3(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16mf4x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf2x3(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16mf2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m1x3(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16m1x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m2x3(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16m2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16mf4x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16mf2x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16m1x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64_v_bf16m2x3_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei8.c new file mode 100644 index 0000000000000..79ffd53eaaa68 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf4x3(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16mf4x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf2x3(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16mf2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m1x3(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16m1x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m2x3(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16m2x3(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16mf4x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16mf2x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16m1x3_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) 
[[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8_v_bf16m2x3_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei32.c new file mode 100644 index 0000000000000..e19555ab1e0d6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf4x4(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16mf4x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf2x4(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16mf2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m1x4(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16m1x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m2x4(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16m2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x4_t vs3, + size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16mf4x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16mf2x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16m1x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m2x4_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32_v_bf16m2x4_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei64.c new file mode 100644 index 0000000000000..6f89441c69faf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf4x4(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return 
__riscv_vsuxseg4ei64_v_bf16mf4x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf2x4(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16mf2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m1x4(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16m1x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m2x4(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16m2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16mf4x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16mf2x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16m1x4_m(vm, rs1, 
vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64_v_bf16m2x4_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei8.c new file mode 100644 index 0000000000000..1a4d4523b2233 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxseg4ei8_v_bf16mf4x4(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16mf4x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf2x4(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16mf2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m1x4(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16m1x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m2x4(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return 
__riscv_vsuxseg4ei8_v_bf16m2x4(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16mf4x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16mf2x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + 
vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16m1x4_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8_v_bf16m2x4_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei32.c new file mode 100644 index 0000000000000..3a96d09995da9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 
[[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf4x5(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32_v_bf16mf4x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf2x5(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32_v_bf16mf2x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16m1x5(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32_v_bf16m1x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x5_t vs3, + size_t vl) { + return __riscv_vsuxseg5ei32_v_bf16mf4x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32_v_bf16mf2x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32_v_bf16m1x5_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei64.c new file mode 100644 index 0000000000000..2d34ab4e7a36e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions 
have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf4x5(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64_v_bf16mf4x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf2x5(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64_v_bf16mf2x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", 
, 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16m1x5(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64_v_bf16m1x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64_v_bf16mf4x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64_v_bf16mf2x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64_v_bf16m1x5_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei8.c new file mode 100644 index 0000000000000..f6f7b4ae6b96e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf4x5(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8_v_bf16mf4x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) 
[[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf2x5(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8_v_bf16mf2x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16m1x5(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8_v_bf16m1x5(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8_v_bf16mf4x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8_v_bf16mf2x5_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8_v_bf16m1x5_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei32.c new file mode 100644 index 0000000000000..7db63cb97d512 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf4x6(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32_v_bf16mf4x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf2x6(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32_v_bf16mf2x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16m1x6(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32_v_bf16m1x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x6_t vs3, + size_t vl) { + return __riscv_vsuxseg6ei32_v_bf16mf4x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32_v_bf16mf2x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32_v_bf16m1x6_m(vm, rs1, vs2, vs3, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei64.c new file mode 100644 index 0000000000000..dd6c263688e7e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf4x6(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64_v_bf16mf4x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf2x6(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return 
__riscv_vsuxseg6ei64_v_bf16mf2x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16m1x6(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64_v_bf16m1x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64_v_bf16mf4x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return 
__riscv_vsuxseg6ei64_v_bf16mf2x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64_v_bf16m1x6_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei8.c new file mode 100644 index 0000000000000..157eba825dc2d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
void +// +void test_vsuxseg6ei8_v_bf16mf4x6(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8_v_bf16mf4x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf2x6(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8_v_bf16mf2x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16m1x6(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8_v_bf16m1x6(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t 
vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8_v_bf16mf4x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8_v_bf16mf2x6_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8_v_bf16m1x6_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei32.c new file mode 100644 index 0000000000000..8e48179147053 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 
+// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf4x7(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32_v_bf16mf4x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf2x7(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32_v_bf16mf2x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
void +// +void test_vsuxseg7ei32_v_bf16m1x7(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32_v_bf16m1x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x7_t vs3, + size_t vl) { + return __riscv_vsuxseg7ei32_v_bf16mf4x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32_v_bf16mf2x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], 
i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32_v_bf16m1x7_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei64.c new file mode 100644 index 0000000000000..6c9a0443f425d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf4x7(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64_v_bf16mf4x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf2x7(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64_v_bf16mf2x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16m1x7(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64_v_bf16m1x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64_v_bf16mf4x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64_v_bf16mf2x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64_v_bf16m1x7_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei8.c new file mode 100644 index 0000000000000..27ced38cf8407 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf4x7(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8_v_bf16mf4x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf2x7(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8_v_bf16mf2x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16m1x7(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8_v_bf16m1x7(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8_v_bf16mf4x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8_v_bf16mf2x7_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8_v_bf16m1x7_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei32.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei32.c new file mode 100644 index 0000000000000..81adc3cb6ba5e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf4x8(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32_v_bf16mf4x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf2x8(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32_v_bf16mf2x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16m1x8(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32_v_bf16m1x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x8_t vs3, + size_t vl) { + return __riscv_vsuxseg8ei32_v_bf16mf4x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32_v_bf16mf2x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local void @test_vsuxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32_v_bf16m1x8_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei64.c new file mode 100644 index 0000000000000..43d76fc2dfd74 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf4x8(__bf16 *rs1, vuint64m1_t vs2, + 
vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64_v_bf16mf4x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf2x8(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64_v_bf16mf2x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16m1x8(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64_v_bf16m1x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return 
__riscv_vsuxseg8ei64_v_bf16mf4x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64_v_bf16mf2x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64_v_bf16m1x8_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei8.c new file mode 100644 index 0000000000000..3976ba816dbeb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vsuxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: 
%clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf4x8(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8_v_bf16mf4x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf2x8(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8_v_bf16mf2x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16m1x8(__bf16 *rs1, 
vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8_v_bf16m1x8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8_v_bf16mf4x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8_v_bf16mf2x8_m(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxseg8ei8_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8_v_bf16m1x8_m(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei32.c new file mode 100644 index 0000000000000..dbf7941678b33 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4(const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2(const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vloxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vloxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m4_t test_vloxei32_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei64.c new file mode 100644 index 0000000000000..9a35316065afd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4(const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2(const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1( +// CHECK-RV64-SAME: 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxei64(vm, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei8.c new file mode 100644 index 0000000000000..e3a6c813660b3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4(const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vloxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2(const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vloxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8(const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vloxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_m(vbool2_t vm, const __bf16 *rs1, + vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8(vm, rs1, 
rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei32.c new file mode 100644 index 0000000000000..7853a29bcfb11 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei32.c @@ -0,0 +1,120 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) 
[[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei64.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei64.c new file mode 100644 index 0000000000000..0a68ccc33290c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return 
__riscv_vloxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t 
test_vloxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei8.c new file mode 100644 index 0000000000000..05b59f3b3e259 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return 
__riscv_vloxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// 
CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei32.c new file mode 100644 index 0000000000000..683e30c9a6692 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) 
@test_vloxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei64.c new file mode 100644 index 0000000000000..69b2809272eba --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t 
test_vloxseg3ei64_v_bf16mf4x3(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] 
+// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei8.c new file mode 100644 index 0000000000000..c6f00fe76ba26 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) 
@test_vloxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, const __bf16 
*rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei32.c new file mode 100644 index 0000000000000..55a980a6f67d9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32(vm, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei64.c new file mode 100644 index 0000000000000..75a0200476176 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t 
test_vloxseg4ei64_v_bf16mf2x4(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", 
, 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei8.c new file mode 100644 index 0000000000000..f0c75373fc92c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) 
@test_vloxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t 
test_vloxseg4ei8_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei32.c new file mode 100644 index 0000000000000..132e27cd557c4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg5ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32(vm, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei64.c new file mode 100644 index 0000000000000..ce1cfeddba1ca --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t 
test_vloxseg5ei64_v_bf16mf2x5(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg5ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei8.c new file mode 100644 index 0000000000000..55b835f3a5421 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg5ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8(vm, rs1, rs2, vl); +} 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei32.c new file mode 100644 index 0000000000000..c8df8d0c6b907 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// 
+vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg6ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei64.c new file mode 100644 index 0000000000000..217b89ce831d1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg6ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { 
+ return __riscv_vloxseg6ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei8.c new file mode 100644 index 0000000000000..4684ebad695a1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg6ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei32.c new file mode 100644 index 0000000000000..d9e2e9b41fed2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) 
@test_vloxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg7ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] 
+// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei64.c new file mode 100644 index 0000000000000..461eebb092c54 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg7ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei8.c new file mode 100644 index 0000000000000..71da2628c9fd6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7(const __bf16 *rs1, vuint8mf2_t rs2, + 
size_t vl) { + return __riscv_vloxseg7ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 
[[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei32.c new file mode 100644 index 0000000000000..7848efaecaec1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei64.c new file mode 100644 index 0000000000000..4e3e388d260b8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S 
-passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t 
test_vloxseg8ei64_v_bf16m1x8(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxseg8ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei8.c new file mode 100644 index 0000000000000..399a03afa0c0e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vloxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei32.c new file mode 100644 index 0000000000000..20bcca1a3fd38 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 
-target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4(const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2(const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei64.c new file mode 100644 index 0000000000000..62012fd029307 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: 
-target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4(const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2(const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m2_t test_vluxei64_v_bf16m2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei8.c new file mode 100644 index 0000000000000..c4bf30d7319c5 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4(const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vluxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2(const __bf16 *rs1, 
vuint8mf4_t rs2, + size_t vl) { + return __riscv_vluxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8(const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vluxei8(rs1, 
rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_m(vbool2_t vm, const __bf16 *rs1, + vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei32.c new file mode 100644 index 0000000000000..85934361676dc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei32.c @@ -0,0 +1,120 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: 
FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2(const __bf16 *rs1, 
vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2(const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// 
+vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei64.c new file mode 100644 index 0000000000000..7def329db7be3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", 
, 2) @test_vluxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxseg2ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, 
+ const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei8.c new file mode 100644 index 0000000000000..bc6f1ffab972c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions 
have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", 
, 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2(const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) 
@test_vluxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_m(vbool4_t vm, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei32.c new file mode 100644 index 0000000000000..8a700b75b4299 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: 
-target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei64.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei64.c new file mode 100644 index 0000000000000..d1d8943310e0b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return 
__riscv_vluxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxseg3ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t 
test_vluxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei8.c new file mode 100644 index 0000000000000..4952e0230a8ea --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxseg3ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_m(vbool16_t vm, const __bf16 *rs1, + 
vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei32.c new file mode 100644 index 0000000000000..d853b86e3dda8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei32.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", 
, 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4(const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei64.c new file mode 100644 index 0000000000000..97dbdd705fc9f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei64.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s 
-o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t 
test_vluxseg4ei64_v_bf16m1x4(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4(const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxseg4ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei8.c new file mode 100644 index 
0000000000000..db435cf18daa4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4(const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxseg4ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// 
+vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_m(vbool8_t vm, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei32.c new file mode 100644 index 0000000000000..3719e841581d1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei64.c new file mode 100644 index 0000000000000..66743a528fd52 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5(const __bf16 *rs1, vuint64m4_t 
rs2, + size_t vl) { + return __riscv_vluxseg5ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei8.c new file mode 100644 index 0000000000000..f97f5b7597170 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei32.c new file mode 100644 index 0000000000000..6d760e51245c1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ 
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6(const 
__bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg6ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) 
poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei64.c new file mode 100644 index 0000000000000..3b2ffacfcda62 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg6ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei8.c new file mode 100644 index 0000000000000..1fef58ed84851 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: 
-emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t 
test_vluxseg6ei8_v_bf16m1x6(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg6ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei32.c new file mode 100644 index 0000000000000..14d7ca9f34800 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei64.c new file mode 100644 index 0000000000000..50d06a0f4b856 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// 
RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) poison, 
ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg7ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei8.c new file mode 100644 index 0000000000000..cdd05faeeded6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) 
@test_vluxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8(vm, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei32.c new file mode 100644 index 0000000000000..8929f3348ba59 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei32.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target 
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8(const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8(const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) 
poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8(const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei32(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei64.c new file mode 100644 index 0000000000000..ece3490dfcc37 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei64.c @@ -0,0 +1,76 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8(const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8(const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8(const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxseg8ei64(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return 
__riscv_vluxseg8ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei8.c new file mode 100644 index 0000000000000..9ee1ce5f28820 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vluxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8(const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8(const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8(const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei8(rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8(vm, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_m(vbool16_t vm, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8(vm, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei32.c new file mode 100644 index 0000000000000..db440e9a49157 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf4(__bf16 *rs1, vuint32mf2_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsoxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf2(__bf16 *rs1, vuint32m1_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsoxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m1(__bf16 *rs1, vuint32m2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsoxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m2(__bf16 *rs1, vuint32m4_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsoxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m4(__bf16 *rs1, vuint32m8_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsoxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint32mf2_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsoxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsoxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsoxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t rs2, + vbfloat16m2_t 
vs3, size_t vl) { + return __riscv_vsoxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei32_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsoxei32(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei64.c new file mode 100644 index 0000000000000..a9c28dd6af613 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf4(__bf16 *rs1, vuint64m1_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsoxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf2( +// CHECK-RV64-SAME: 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf2(__bf16 *rs1, vuint64m2_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsoxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m1(__bf16 *rs1, vuint64m4_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsoxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m2(__bf16 *rs1, vuint64m8_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsoxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsoxei64(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local void @test_vsoxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsoxei64(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsoxei64(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei64_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsoxei64(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei8.c new file mode 100644 index 0000000000000..7106538c741d3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxei8.c @@ 
-0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf4(__bf16 *rs1, vuint8mf8_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsoxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf2(__bf16 *rs1, vuint8mf4_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsoxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m1(__bf16 *rs1, vuint8mf2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsoxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m2(__bf16 *rs1, vuint8m1_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsoxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m4(__bf16 *rs1, vuint8m2_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsoxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m8(__bf16 *rs1, vuint8m4_t rs2, vbfloat16m8_t vs3, + size_t vl) { + return __riscv_vsoxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsoxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsoxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsoxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsoxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsoxei8_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsoxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxei8_v_bf16m8_m(vbool2_t vm, __bf16 *rs1, vuint8m4_t rs2, + vbfloat16m8_t vs3, size_t vl) { + return __riscv_vsoxei8(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei32.c new file mode 100644 index 0000000000000..cc3954a0eecf2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei32.c @@ -0,0 +1,119 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf4x2(__bf16 
*rs1, vuint32mf2_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf2x2(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m1x2(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m2x2(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local void @test_vsoxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m4x2(__bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x2_t vs3, + size_t vl) { + return __riscv_vsoxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei32_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei32(vm, rs1, vs2, vs3, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei64.c new file mode 100644 index 0000000000000..5b716beb149b7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf4x2(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf2x2(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return 
__riscv_vsoxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m1x2(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m2x2(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsoxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei64_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei64(vm, rs1, vs2, vs3, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei8.c new file mode 100644 index 0000000000000..c346040ae6da3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf4x2(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf2x2(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(rs1, vs2, 
vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m1x2(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m2x2(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m4x2(__bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], 
[[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef 
[[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg2ei8_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsoxseg2ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei32.c new file mode 100644 index 0000000000000..c283b82c2f9f9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// 
CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf4x3(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf2x3(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m1x3(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m2x3(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x3_t vs3, + size_t vl) { + return __riscv_vsoxseg3ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei32_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei64.c new file mode 100644 index 0000000000000..3a7dd53a4999b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: 
define dso_local void @test_vsoxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf4x3(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf2x3(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m1x3(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m2x3(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei64_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei8.c new file mode 100644 index 0000000000000..5ae57d256056e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: 
define dso_local void @test_vsoxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf4x3(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf2x3(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m1x3(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) 
[[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m2x3(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg3ei8_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsoxseg3ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei32.c new file mode 100644 index 0000000000000..23fe189930964 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf4x4( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf4x4(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf2x4(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m1x4(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m2x4(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x4_t vs3, + size_t vl) { + return __riscv_vsoxseg4ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei32_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei64.c new file mode 100644 index 0000000000000..5411c6af8ae7a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef 
[[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf4x4(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf2x4(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m1x4(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m2x4(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei64_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei8.c new file mode 100644 index 0000000000000..783029bc125a6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf4x4(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf2x4(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m1x4(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m2x4(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg4ei8_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsoxseg4ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei32.c new file mode 100644 index 0000000000000..9f18e9c1a965c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) 
[[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf4x5(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf2x5(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16m1x5(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x5_t vs3, + size_t vl) { + return __riscv_vsoxseg5ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei32_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei64.c new file mode 100644 index 0000000000000..8ea144f8d9882 
--- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf4x5(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf2x5(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16m1x5(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei64_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei8.c new file mode 100644 index 0000000000000..d61d35ec7ef22 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf4x5(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf2x5(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16m1x5(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call 
void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg5ei8_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsoxseg5ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei32.c new file mode 100644 index 0000000000000..4050a61ec1d66 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 
6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf4x6(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf2x6(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16m1x6(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x6_t vs3, + size_t vl) { + return __riscv_vsoxseg6ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei32_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei64.c new file mode 100644 index 0000000000000..90fc1a66802fb 
--- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf4x6(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf2x6(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16m1x6(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei64_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei8.c new file mode 100644 index 0000000000000..7c9b7fd261ef2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf4x6(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf2x6(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16m1x6(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call 
void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg6ei8_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsoxseg6ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei32.c new file mode 100644 index 0000000000000..2fa45968720e7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 
7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf4x7(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf2x7(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16m1x7(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x7_t vs3, + size_t vl) { + return __riscv_vsoxseg7ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei32_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei64.c new file mode 100644 index 0000000000000..26462c20ab1a1 
--- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf4x7(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf2x7(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16m1x7(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei64_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei8.c new file mode 100644 index 0000000000000..b3709517fd352 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf4x7(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf2x7(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16m1x7(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call 
void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg7ei8_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsoxseg7ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei32.c new file mode 100644 index 0000000000000..412fdb9e37643 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 
8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf4x8(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf2x8(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16m1x8(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x8_t vs3, + size_t vl) { + return __riscv_vsoxseg8ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei32_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei64.c new file mode 100644 index 0000000000000..7f34e54b0a1fb 
--- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf4x8(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf2x8(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16m1x8(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei64_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei8.c new file mode 100644 index 0000000000000..088ca08fca7f4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsoxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf4x8(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf2x8(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16m1x8(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call 
void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsoxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsoxseg8ei8_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsoxseg8ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei32.c new file mode 100644 index 0000000000000..13a2d299c8f73 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei32.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf4(__bf16 *rs1, vuint32mf2_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsuxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf2(__bf16 *rs1, vuint32m1_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsuxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m1(__bf16 *rs1, vuint32m2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsuxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m2(__bf16 *rs1, vuint32m4_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsuxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m4(__bf16 *rs1, vuint32m8_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsuxei32(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint32mf2_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsuxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsuxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsuxei32(vm, rs1, 
rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsuxei32(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei32_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei32_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsuxei32(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei64.c new file mode 100644 index 0000000000000..a6c618a2fb760 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf4(__bf16 *rs1, vuint64m1_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsuxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf2(__bf16 *rs1, vuint64m2_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsuxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m1(__bf16 *rs1, vuint64m4_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsuxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m2(__bf16 *rs1, vuint64m8_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsuxei64(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf4_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsuxei64(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsuxei64(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei64_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsuxei64(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei64_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxei64_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsuxei64(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei8.c new file mode 100644 index 0000000000000..67c4d4b952a3a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxei8.c @@ -0,0 +1,140 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf4(__bf16 *rs1, vuint8mf8_t rs2, vbfloat16mf4_t vs3, + size_t vl) { + return __riscv_vsuxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf2(__bf16 *rs1, vuint8mf4_t rs2, vbfloat16mf2_t vs3, + size_t vl) { + return __riscv_vsuxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxei8_v_bf16m1( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m1(__bf16 *rs1, vuint8mf2_t rs2, vbfloat16m1_t vs3, + size_t vl) { + return __riscv_vsuxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m2(__bf16 *rs1, vuint8m1_t rs2, vbfloat16m2_t vs3, + size_t vl) { + return __riscv_vsuxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m4(__bf16 *rs1, vuint8m2_t rs2, vbfloat16m4_t vs3, + size_t vl) { + return __riscv_vsuxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m8(__bf16 *rs1, vuint8m4_t rs2, vbfloat16m8_t vs3, + size_t vl) { + return __riscv_vsuxei8(rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxei8_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t rs2, + vbfloat16mf4_t vs3, size_t vl) { + return __riscv_vsuxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16mf2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t rs2, + vbfloat16mf2_t vs3, size_t vl) { + return __riscv_vsuxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m1_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t rs2, + vbfloat16m1_t vs3, size_t vl) { + return __riscv_vsuxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_vsuxei8_v_bf16m2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t rs2, + vbfloat16m2_t vs3, size_t vl) { + return __riscv_vsuxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m4_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t rs2, + vbfloat16m4_t vs3, size_t vl) { + return __riscv_vsuxei8(vm, rs1, rs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxei8_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VS3]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxei8_v_bf16m8_m(vbool2_t vm, __bf16 *rs1, vuint8m4_t rs2, + vbfloat16m8_t vs3, size_t vl) { + return __riscv_vsuxei8(vm, rs1, rs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei32.c new file mode 100644 index 0000000000000..9a3b30c9eff1a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei32.c @@ -0,0 +1,119 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf4x2(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf2x2(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m1x2(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m2x2( +// 
CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m2x2(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m4x2(__bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x2_t vs3, + size_t vl) { + return __riscv_vsuxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei32_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef 
[[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei32_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint32m8_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei64.c new file mode 100644 index 0000000000000..b2c445730d2fb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf4x2(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
void @test_vsuxseg2ei64_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf2x2(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m1x2(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m2x2(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) 
[[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei64_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei64_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei8.c new file mode 100644 index 0000000000000..6b9774c560521 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg2ei8.c @@ -0,0 +1,118 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf4x2(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void 
@test_vsuxseg2ei8_v_bf16mf2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf2x2(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m1x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m1x2(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m2x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m2x2(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m4x2( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m4x2(__bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf4x2_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16mf2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16mf2x2_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m1x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m1x2_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m2x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m2x2_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg2ei8_v_bf16m4x2_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 2) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg2ei8_v_bf16m4x2_m(vbool4_t vm, __bf16 *rs1, vuint8m2_t vs2, + vbfloat16m4x2_t vs3, size_t vl) { + return __riscv_vsuxseg2ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei32.c new file mode 100644 
index 0000000000000..402045661c33f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf4x3(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf2x3(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m1x3(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m2x3(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x3_t vs3, + size_t vl) { + return __riscv_vsuxseg3ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei32_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei32_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei64.c new file mode 100644 index 
0000000000000..f1ebe0b61966a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf4x3(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf2x3(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m1x3(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m2x3(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei64_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei64_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei8.c new file mode 100644 index 0000000000000..d0c36534ca869 --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg3ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf4x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf4x3(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf2x3(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m1x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m1x3(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m2x3( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m2x3(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf4x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf4x3_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16mf2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16mf2x3_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m1x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m1x3_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg3ei8_v_bf16m2x3_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 3) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg3ei8_v_bf16m2x3_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x3_t vs3, size_t vl) { + return __riscv_vsuxseg3ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei32.c new file mode 100644 index 0000000000000..6b1c29dae50ad --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei32.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf4x4(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf2x4(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m1x4(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m2x4(__bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x4_t vs3, + size_t vl) { + return __riscv_vsuxseg4ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei32_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei32_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint32m4_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei64.c new file mode 100644 index 0000000000000..8c3c04ee76a50 --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei64.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf4x4(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf2x4(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m1x4(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m2x4(__bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei64_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei64_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint64m8_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei8.c new file mode 100644 index 0000000000000..b052914453463 --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg4ei8.c @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf4x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf4x4(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf2x4(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m1x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m1x4(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m2x4( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m2x4(__bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf4x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf4x4_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16mf2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16mf2x4_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m1x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m1x4_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg4ei8_v_bf16m2x4_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 4) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg4ei8_v_bf16m2x4_m(vbool8_t vm, __bf16 *rs1, vuint8m1_t vs2, + vbfloat16m2x4_t vs3, size_t vl) { + return __riscv_vsuxseg4ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei32.c new file mode 100644 index 0000000000000..8e2b11b0bda6f --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf4x5(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf2x5(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16m1x5(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x5_t vs3, + size_t vl) { + return __riscv_vsuxseg5ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei32_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei32_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei64.c new file mode 100644 index 0000000000000..e3f24c791744e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf4x5(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf2x5(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16m1x5(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei64_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei64_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei8.c new file mode 100644 index 0000000000000..1130456ee6cfd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg5ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf4x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf4x5(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf2x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf2x5(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16m1x5( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16m1x5(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf4x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf4x5_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16mf2x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16mf2x5_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg5ei8_v_bf16m1x5_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 5) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg5ei8_v_bf16m1x5_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x5_t vs3, size_t vl) { + return __riscv_vsuxseg5ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei32.c new file mode 100644 index 0000000000000..dc619aa60aef0 --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf4x6(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf2x6(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16m1x6(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x6_t vs3, + size_t vl) { + return __riscv_vsuxseg6ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei32_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei32_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei64.c new file mode 100644 index 0000000000000..35a776bcda289 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf4x6(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf2x6(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16m1x6(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei64_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei64_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei8.c new file mode 100644 index 0000000000000..d474a5682b17c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg6ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf4x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf4x6(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf2x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf2x6(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16m1x6( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16m1x6(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf4x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf4x6_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16mf2x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16mf2x6_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg6ei8_v_bf16m1x6_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 6) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg6ei8_v_bf16m1x6_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x6_t vs3, size_t vl) { + return __riscv_vsuxseg6ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei32.c new file mode 100644 index 0000000000000..d2e1859fa300f --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf4x7(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf2x7(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16m1x7(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x7_t vs3, + size_t vl) { + return __riscv_vsuxseg7ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei32_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei32_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei64.c new file mode 100644 index 0000000000000..cfc1cfca208c1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf4x7(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf2x7(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16m1x7(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei64_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei64_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei8.c new file mode 100644 index 0000000000000..a887bf12fd5bf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg7ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf4x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf4x7(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf2x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf2x7(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16m1x7( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16m1x7(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf4x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf4x7_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16mf2x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16mf2x7_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg7ei8_v_bf16m1x7_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 7) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg7ei8_v_bf16m1x7_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x7_t vs3, size_t vl) { + return __riscv_vsuxseg7ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei32.c new file mode 100644 index 0000000000000..38f3f8a3a96e2 --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei32.c @@ -0,0 +1,75 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf4x8(__bf16 *rs1, vuint32mf2_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf2x8(__bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16m1x8(__bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, + vuint32mf2_t vs2, vbfloat16mf4x8_t vs3, + size_t vl) { + return __riscv_vsuxseg8ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint32m1_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei32_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei32_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint32m2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei32(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei64.c new file mode 100644 index 0000000000000..4adeaf94608eb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei64.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf4x8(__bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf2x8(__bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16m1x8(__bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint64m1_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint64m2_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei64_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei64_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint64m4_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei64(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei8.c new file mode 100644 index 0000000000000..25cbcf6887063 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vsuxseg8ei8.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf4x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], 
target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf4x8(__bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf2x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf2x8(__bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16m1x8( +// CHECK-RV64-SAME: ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16m1x8(__bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8(rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf4x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf4x8_m(vbool64_t vm, __bf16 *rs1, vuint8mf8_t vs2, + vbfloat16mf4x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16mf2x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16mf2x8_m(vbool32_t vm, __bf16 *rs1, vuint8mf4_t vs2, + vbfloat16mf2x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8(vm, rs1, vs2, vs3, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_vsuxseg8ei8_v_bf16m1x8_m( +// CHECK-RV64-SAME: [[VM:%.*]], ptr noundef [[RS1:%.*]], [[VS2:%.*]], target("riscv.vector.tuple", , 8) [[VS3:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VS3]], ptr [[RS1]], [[VS2]], [[VM]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_vsuxseg8ei8_v_bf16m1x8_m(vbool16_t vm, __bf16 *rs1, vuint8mf2_t vs2, + vbfloat16m1x8_t vs3, size_t vl) { + return __riscv_vsuxseg8ei8(vm, rs1, vs2, vs3, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei32.c new file mode 100644 index 0000000000000..ec107fa023afa --- 
/dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t 
test_vloxei32_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m1_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m1_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m1_tumu(vm, vd, rs1, rs2, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16mf2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m1_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_vloxei32_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32_v_bf16m4_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei64.c new file mode 100644 index 0000000000000..22081708baf15 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return 
__riscv_vloxei64_v_bf16mf2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m1_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m1_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, 
const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m1_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_mu( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16mf2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m1_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], 
ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64_v_bf16m2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei8.c new file mode 100644 index 0000000000000..b0b97875ac3ba --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxei8.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vloxei8_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m1_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vloxei.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_tu(vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) 
{ + return __riscv_vloxei8_v_bf16m1_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m1_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vloxei8_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16mf2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m1_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8_v_bf16m8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei32.c new file mode 100644 index 0000000000000..df494cfe1b233 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_tu(vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m1x2_tu(vd, rs1, rs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_tu(vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_tu(vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_tum(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_tum(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_tum(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, 
vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m1x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_tum(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_tumu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_tumu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", 
, 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_tumu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m1x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_tumu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_tumu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_mu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf4x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_mu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16mf2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m1x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t 
rs2, size_t vl) { + return __riscv_vloxseg2ei32_v_bf16m4x2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei64.c new file mode 100644 index 0000000000000..1d7d48a3305c9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_tu(vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m1x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_tu(vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 
2) @test_vloxseg2ei64_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_tum(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_tum(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_tum(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m1x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_tumu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, 
vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_tumu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_tumu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m1x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_tumu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_mu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf4x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", 
, 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_mu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16mf2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m1x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_v_bf16m2x2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei8.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei8.c new file mode 100644 index 0000000000000..44070dd3888c7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg2ei8.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) 
[[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_tu(vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m1x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_tu(vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_tu(vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_tum(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_tum(vbool32_t vm, vbfloat16mf2x2_t vd, const 
__bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_tum(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m1x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_tum(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_tumu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] 
+// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_tumu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_tumu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m1x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_tumu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_tumu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_mu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf4x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_mu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16mf2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m1x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_v_bf16m4x2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei32.c new file mode 100644 index 0000000000000..86b6692126e89 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei32.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf4x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_tu(vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m1x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_tu(vbfloat16m2x3_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_tum(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf4x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_tum(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_tum(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m1x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, 
vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_tumu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf4x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_tumu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_tumu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m1x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_tumu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_mu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf4x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_mu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16mf2x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m1x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_mu( 
+// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_v_bf16m2x3_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei64.c new file mode 100644 index 0000000000000..960df2840fadd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 
4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf4x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_tu(vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m1x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_tu(vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_tum(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf4x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// 
+vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_tum(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_tum(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m1x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_tumu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf4x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_tumu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], 
ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_tumu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m1x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_tumu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_mu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf4x3_mu(vm, vd, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_mu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16mf2x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m1x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_v_bf16m2x3_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei8.c new file mode 100644 index 0000000000000..00d0958458b0f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg3ei8.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf4x3_tu(vd, rs1, rs2, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_tu(vbfloat16m1x3_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m1x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 
[[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_tu(vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_tum(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf4x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_tum(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) 
@test_vloxseg3ei8_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_tum(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m1x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_tumu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf4x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_tumu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_tumu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, 
vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m1x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_tumu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_mu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf4x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_mu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16mf2x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m1x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t 
test_vloxseg3ei8_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_v_bf16m2x3_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei32.c new file mode 100644 index 0000000000000..0ad8e794bc4bc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei32.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf4x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_tu(vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m1x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_tu(vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return 
__riscv_vloxseg4ei32_v_bf16m2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_tum(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf4x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_tum(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_tum(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m1x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t 
test_vloxseg4ei32_v_bf16mf4x4_tumu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf4x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_tumu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_tumu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m1x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_tumu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_mu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf4x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_mu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16mf2x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m1x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_v_bf16m2x4_mu(vm, vd, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei64.c new file mode 100644 index 0000000000000..337031e119f7d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf4x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) 
[[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_tu(vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m1x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_tu(vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_tum(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf4x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_tum(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_tum(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m1x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_tumu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf4x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_tumu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_tumu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m1x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_tumu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_mu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf4x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_mu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, 
vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16mf2x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m1x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_v_bf16m2x4_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei8.c new file mode 100644 index 
0000000000000..f2d9383676af1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg4ei8.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf4x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) 
{ + return __riscv_vloxseg4ei8_v_bf16mf2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_tu(vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m1x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_tu(vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_tum(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf4x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_tum(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_tum(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t 
rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m1x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_tumu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf4x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_tumu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_tumu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m1x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// 
+vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_tumu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_mu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf4x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_mu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16mf2x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m1x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_v_bf16m2x4_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei32.c new file mode 100644 index 0000000000000..8a43698cc7b8d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// 
REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf4x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf2x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_tu(vbfloat16m1x5_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16m1x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_tum(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf4x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) 
[[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_tum(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf2x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_tum(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16m1x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_tumu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf4x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_tumu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf2x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_tumu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16m1x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_mu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf4x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_mu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16mf2x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, 
vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_v_bf16m1x5_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei64.c new file mode 100644 index 0000000000000..90bd04ecaf510 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf4x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf2x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_tu(vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16m1x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_tum(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return 
__riscv_vloxseg5ei64_v_bf16mf4x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_tum(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf2x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_tum(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16m1x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_tumu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf4x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_tumu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf2x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t 
test_vloxseg5ei64_v_bf16m1x5_tumu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16m1x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_mu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf4x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_mu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16mf2x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_v_bf16m1x5_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei8.c new file mode 100644 index 0000000000000..bd25294cbade4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg5ei8.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf4x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf2x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_tu(vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16m1x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_tum(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf4x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_tum(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf2x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] 
+// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_tum(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16m1x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_tumu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf4x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_tumu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf2x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_tumu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16m1x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_mu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf4x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_mu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16mf2x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_v_bf16m1x5_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei32.c new file mode 100644 index 0000000000000..017317d5d681a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vloxseg6ei32_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf4x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf2x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_tu(vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16m1x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_tum(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf4x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_tum(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf2x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_tum( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_tum(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16m1x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_tumu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf4x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_tumu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf2x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_tumu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16m1x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_mu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, 
vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf4x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_mu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16mf2x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_v_bf16m1x6_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei64.c new file mode 100644 index 
0000000000000..b835ec1bbcbf6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf4x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, 
size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf2x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_tu(vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16m1x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_tum(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf4x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_tum(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf2x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_tum(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16m1x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_tumu(vbool64_t vm, 
vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf4x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_tumu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf2x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_tumu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16m1x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_mu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf4x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_mu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16mf2x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// 
CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_v_bf16m1x6_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei8.c new file mode 100644 index 0000000000000..7f12dd202bed6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg6ei8.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf4x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf2x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_tu(vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16m1x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t 
test_vloxseg6ei8_v_bf16mf4x6_tum(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf4x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_tum(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf2x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_tum(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16m1x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 
6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_tumu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf4x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_tumu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf2x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_tumu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16m1x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_mu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf4x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_mu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16mf2x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vloxseg6ei8_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_v_bf16m1x6_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei32.c new file mode 100644 index 0000000000000..6478fcf7ab914 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr 
[[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf4x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf2x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_tu(vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16m1x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_tum(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf4x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_tum(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf2x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_tum(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16m1x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_tumu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf4x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_tumu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf2x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_tumu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16m1x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_mu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf4x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_mu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16mf2x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_v_bf16m1x7_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei64.c new file mode 100644 index 0000000000000..986045a6b85b8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// 
RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf4x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf2x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_tu(vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16m1x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_tum(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf4x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_tum(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return 
__riscv_vloxseg7ei64_v_bf16mf2x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_tum(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16m1x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_tumu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf4x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_tumu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf2x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_tumu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16m1x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t 
test_vloxseg7ei64_v_bf16mf4x7_mu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf4x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_mu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16mf2x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_v_bf16m1x7_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei8.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei8.c new file mode 100644 index 0000000000000..bfe80e670df0b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg7ei8.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf4x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) 
[[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf2x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_tu(vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16m1x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_tum(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf4x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_tum(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf2x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_tum(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16m1x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_tumu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf4x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_tumu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf2x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_tumu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16m1x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) 
@test_vloxseg7ei8_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_mu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf4x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_mu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16mf2x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_v_bf16m1x7_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei32.c new file mode 100644 index 0000000000000..c5679ae3ca327 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf4x8_tu(vd, rs1, rs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf2x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_tu(vbfloat16m1x8_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16m1x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_tum(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf4x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_tum(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf2x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_tum(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, 
vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16m1x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_tumu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf4x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_tumu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf2x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_tumu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16m1x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_mu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf4x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_mu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16mf2x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_v_bf16m1x8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei64.c new file mode 100644 index 0000000000000..3e87a24394fd0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: 
target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf4x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf2x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t 
test_vloxseg8ei64_v_bf16m1x8_tu(vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16m1x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_tum(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf4x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_tum(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf2x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_tum(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16m1x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_tumu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf4x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_tumu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf2x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_tumu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16m1x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_mu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf4x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_mu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16mf2x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_v_bf16m1x8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei8.c new file mode 100644 index 0000000000000..b4c20694b3599 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vloxseg8ei8.c @@ -0,0 +1,128 
@@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf4x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf2x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) 
@test_vloxseg8ei8_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_tu(vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16m1x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_tum(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf4x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_tum(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf2x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_tum(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16m1x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_tumu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf4x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_tumu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf2x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_tumu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16m1x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_mu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf4x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_mu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_v_bf16mf2x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, 
size_t vl) { + return __riscv_vloxseg8ei8_v_bf16m1x8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei32.c new file mode 100644 index 0000000000000..85d344f63301b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vluxei32_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m1_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vluxei32_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m1_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vluxei32_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m1_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_mu(vbool64_t 
vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16mf2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m1_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_mu( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32_v_bf16m4_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei64.c new file mode 100644 index 0000000000000..7d5914e241fad --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m1_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return 
__riscv_vluxei64_v_bf16mf4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m1_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m1_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16mf2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m1_mu(vm, vd, rs1, rs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64_v_bf16m2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei8.c new file mode 100644 index 0000000000000..118843e2adf35 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxei8.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_tu( +// CHECK-RV64-SAME: 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m1_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + 
return __riscv_vluxei8_v_bf16m4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_tu(vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m1_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_tum(vbool2_t vm, 
vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m1_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( 
[[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16mf2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m1_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return 
__riscv_vluxei8_v_bf16m2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, const __bf16 *rs1, vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8_v_bf16m8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei32.c new file mode 100644 index 0000000000000..3428217434ec2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + 
+#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) 
[[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_tu(vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m1x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_tu(vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_tu(vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", 
, 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_tum(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_tum(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 
[[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_tum(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m1x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_tum(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_tumu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_tumu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_tumu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m1x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_tumu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_tumu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, 
vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_mu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf4x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_mu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16mf2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m1x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// 
+vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_v_bf16m4x2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei64.c new file mode 100644 index 0000000000000..487ae96284022 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_tu(vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m1x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_tu(vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return 
__riscv_vluxseg2ei64_v_bf16m2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_tum(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_tum(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_tum(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m1x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t 
test_vluxseg2ei64_v_bf16mf4x2_tumu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_tumu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_tumu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m1x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_tumu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_mu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf4x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_mu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16mf2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m1x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_v_bf16m2x2_mu(vm, vd, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei8.c new file mode 100644 index 0000000000000..d228b2b6db49d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg2ei8.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VD]], 
ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_tu(vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m1x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_tu(vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m2x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_tu(vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m4x2_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_tum(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] 
+// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_tum(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_tum(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m1x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m2x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_tum(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m4x2_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_tumu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_tumu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_tumu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m1x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_tumu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m2x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_tumu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m4x2_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_mu(vbool64_t vm, vbfloat16mf4x2_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf4x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_mu(vbool32_t vm, vbfloat16mf2x2_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16mf2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m1x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t 
vl) { + return __riscv_vluxseg2ei8_v_bf16m2x2_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, const __bf16 *rs1, vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_v_bf16m4x2_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei32.c new file mode 100644 index 0000000000000..ff211e99a5f4e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei32.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf4x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_tu(vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m1x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_tu(vbfloat16m2x3_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_tum(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf4x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_tum(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_tum(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m1x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, 
vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_tumu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf4x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_tumu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_tumu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m1x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_tumu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_mu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf4x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_mu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16mf2x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m1x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_mu( 
+// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_v_bf16m2x3_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei64.c new file mode 100644 index 0000000000000..ae79962b81b46 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 
4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf4x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_tu(vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m1x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_tu(vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_tum(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf4x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// 
+vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_tum(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_tum(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m1x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_tumu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf4x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_tumu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], 
ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_tumu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m1x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_tumu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_mu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf4x3_mu(vm, vd, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_mu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16mf2x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m1x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_v_bf16m2x3_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei8.c new file mode 100644 index 0000000000000..18c7af2663099 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg3ei8.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf4x3_tu(vd, rs1, rs2, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_tu(vbfloat16m1x3_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m1x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 
[[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_tu(vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m2x3_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_tum(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf4x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_tum(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) 
@test_vluxseg3ei8_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_tum(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m1x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m2x3_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_tumu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf4x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_tumu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_tumu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, 
vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m1x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_tumu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m2x3_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_mu(vbool64_t vm, vbfloat16mf4x3_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf4x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_mu(vbool32_t vm, vbfloat16mf2x3_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16mf2x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m1x3_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t 
test_vluxseg3ei8_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_v_bf16m2x3_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei32.c new file mode 100644 index 0000000000000..d0c9adf52942c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei32.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf4x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_tu(vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m1x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_tu(vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return 
__riscv_vluxseg4ei32_v_bf16m2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_tum(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf4x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_tum(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_tum(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m1x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t 
test_vluxseg4ei32_v_bf16mf4x4_tumu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf4x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_tumu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_tumu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m1x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_tumu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_mu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf4x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_mu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16mf2x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m1x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_v_bf16m2x4_mu(vm, vd, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei64.c new file mode 100644 index 0000000000000..b68db5fefe5e8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei64.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf4x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) 
[[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_tu(vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m1x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_tu(vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_tum(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf4x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_tum(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_tum(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m1x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_tumu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf4x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_tumu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_tumu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m1x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_tumu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_mu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf4x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_mu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, 
vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16mf2x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m1x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_v_bf16m2x4_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei8.c new file mode 100644 index 
0000000000000..1cca5289ab74b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg4ei8.c @@ -0,0 +1,168 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf4x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) 
{ + return __riscv_vluxseg4ei8_v_bf16mf2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_tu(vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m1x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_tu(vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m2x4_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_tum(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf4x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_tum(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_tum(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t 
rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m1x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m2x4_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_tumu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf4x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_tumu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_tumu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m1x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// 
+vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_tumu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m2x4_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_mu(vbool64_t vm, vbfloat16mf4x4_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf4x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_mu(vbool32_t vm, vbfloat16mf2x4_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16mf2x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m1x4_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, const __bf16 *rs1, vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_v_bf16m2x4_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei32.c new file mode 100644 index 0000000000000..3e1d4e325c4a8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// 
REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf4x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf2x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_tu(vbfloat16m1x5_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16m1x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_tum(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf4x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) 
[[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_tum(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf2x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_tum(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16m1x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_tumu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf4x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_tumu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf2x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_tumu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16m1x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_mu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf4x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_mu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16mf2x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, 
vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_v_bf16m1x5_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei64.c new file mode 100644 index 0000000000000..cc8c4dfc2b057 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf4x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf2x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_tu(vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16m1x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_tum(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return 
__riscv_vluxseg5ei64_v_bf16mf4x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_tum(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf2x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_tum(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16m1x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_tumu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf4x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_tumu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf2x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t 
test_vluxseg5ei64_v_bf16m1x5_tumu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16m1x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_mu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf4x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_mu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16mf2x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_v_bf16m1x5_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei8.c new file mode 100644 index 0000000000000..779368d55e95a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg5ei8.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf4x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf2x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_tu(vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16m1x5_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_tum(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf4x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_tum(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf2x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] 
+// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_tum(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16m1x5_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_tumu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf4x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_tumu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf2x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_tumu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16m1x5_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_mu(vbool64_t vm, vbfloat16mf4x5_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf4x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_mu(vbool32_t vm, vbfloat16mf2x5_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16mf2x5_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_v_bf16m1x5_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei32.c new file mode 100644 index 0000000000000..dec5b0af5eab1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vluxseg6ei32_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf4x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf2x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_tu(vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16m1x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_tum(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf4x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_tum(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf2x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_tum( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_tum(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16m1x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_tumu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf4x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_tumu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf2x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_tumu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16m1x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_mu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, 
vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf4x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_mu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16mf2x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_v_bf16m1x6_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei64.c new file mode 100644 index 
0000000000000..463f026e4d897 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf4x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, 
size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf2x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_tu(vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16m1x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_tum(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf4x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_tum(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf2x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_tum(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16m1x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_tumu(vbool64_t vm, 
vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf4x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_tumu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf2x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_tumu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16m1x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_mu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf4x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_mu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16mf2x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// 
CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_v_bf16m1x6_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei8.c new file mode 100644 index 0000000000000..88a89bd3c1480 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg6ei8.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf4x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf2x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_tu(vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16m1x6_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t 
test_vluxseg6ei8_v_bf16mf4x6_tum(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf4x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_tum(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf2x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_tum(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16m1x6_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 
6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_tumu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf4x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_tumu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf2x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_tumu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16m1x6_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_mu(vbool64_t vm, vbfloat16mf4x6_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf4x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_mu(vbool32_t vm, vbfloat16mf2x6_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16mf2x6_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vluxseg6ei8_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_v_bf16m1x6_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei32.c new file mode 100644 index 0000000000000..f14c2bd126226 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr 
[[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf4x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf2x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_tu(vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16m1x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_tum(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf4x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_tum(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf2x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_tum(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16m1x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_tumu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf4x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_tumu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf2x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_tumu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16m1x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_mu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf4x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_mu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16mf2x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_v_bf16m1x7_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei64.c new file mode 100644 index 0000000000000..c2cb327af0535 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// 
RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf4x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf2x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_tu(vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16m1x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_tum(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf4x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_tum(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return 
__riscv_vluxseg7ei64_v_bf16mf2x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_tum(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16m1x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_tumu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf4x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_tumu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf2x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_tumu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16m1x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t 
test_vluxseg7ei64_v_bf16mf4x7_mu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf4x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_mu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16mf2x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_v_bf16m1x7_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei8.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei8.c new file mode 100644 index 0000000000000..93b0a5539ff59 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg7ei8.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf4x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) 
[[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf2x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_tu(vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16m1x7_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_tum(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf4x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_tum(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf2x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_tum(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16m1x7_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_tumu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf4x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_tumu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf2x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_tumu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16m1x7_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) 
@test_vluxseg7ei8_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_mu(vbool64_t vm, vbfloat16mf4x7_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf4x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_mu(vbool32_t vm, vbfloat16mf2x7_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16mf2x7_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_v_bf16m1x7_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei32.c new file mode 100644 index 0000000000000..b0e1656a27139 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei32.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf4x8_tu(vd, rs1, rs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf2x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_tu(vbfloat16m1x8_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16m1x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_tum(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf4x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_tum(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf2x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_tum(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, 
vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16m1x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_tumu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf4x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_tumu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf2x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_tumu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16m1x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_mu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf4x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_mu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16mf2x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_v_bf16m1x8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei64.c new file mode 100644 index 0000000000000..9820438b36135 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei64.c @@ -0,0 +1,128 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: 
target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf4x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf2x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t 
test_vluxseg8ei64_v_bf16m1x8_tu(vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16m1x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_tum(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf4x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_tum(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf2x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_tum(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16m1x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_tumu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf4x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_tumu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf2x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_tumu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16m1x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_mu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf4x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_mu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16mf2x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_v_bf16m1x8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei8.c new file mode 100644 index 0000000000000..5290e29b6bb05 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vluxseg8ei8.c @@ -0,0 +1,128 
@@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf4x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf2x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) 
@test_vluxseg8ei8_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_tu(vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16m1x8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_tum(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf4x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_tum(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf2x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_tum(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16m1x8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_tumu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf4x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_tumu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf2x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_tumu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16m1x8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_mu(vbool64_t vm, vbfloat16mf4x8_t vd, const __bf16 *rs1, vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf4x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_mu(vbool32_t vm, vbfloat16mf2x8_t vd, const __bf16 *rs1, vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_v_bf16mf2x8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, const __bf16 *rs1, vuint8mf2_t rs2, 
size_t vl) { + return __riscv_vluxseg8ei8_v_bf16m1x8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei32.c new file mode 100644 index 0000000000000..82ea42b1b5b87 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei32.c @@ -0,0 +1,243 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_tu( +// 
CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t 
vd, + const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei32_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxei32_mu(vm, vd, 
rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei32_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei32_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vloxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei32_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vloxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei32_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei32_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vloxei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei64.c new file mode 100644 index 0000000000000..ec6ee7d626b52 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei64.c @@ -0,0 +1,196 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei64_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei64_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei64_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vloxei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei64_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei64_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vloxei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei8.c new file mode 100644 index 0000000000000..93a6c28e4a79c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxei8.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vloxei.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_tu(vbfloat16m8_t vd, const __bf16 *rs1, + vuint8m4_t rs2, size_t vl) { + return __riscv_vloxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vloxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vloxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return 
__riscv_vloxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vloxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vloxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vloxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t 
test_vloxei8_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vloxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vloxei8_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vloxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16mf2_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vloxei8_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vloxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vloxei8_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vloxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vloxei8_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vloxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vloxei8_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vloxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vloxei8_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vloxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vloxei8_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vloxei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei32.c new file mode 100644 index 0000000000000..2dc68cf30319c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei32.c @@ -0,0 +1,264 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_tu(vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) 
@test_vloxseg2ei32_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_tu(vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_tu(vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: 
ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_tum(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_tum(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_tum(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_tum( 
+// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_tum(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_tum(vbool4_t vm, + vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 
2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_tumu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_tumu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_tumu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_tumu(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_tumu(vbool4_t vm, + vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei32_v_bf16mf4x2_mu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei32_v_bf16mf2x2_mu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei32_v_bf16m1x2_mu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t 
rs2, size_t vl) { + return __riscv_vloxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei32_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei32_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei32_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei64.c new file mode 100644 index 0000000000000..aebef33ace64a --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei64.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tu(vd, rs1, 
rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_tu(vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_tu(vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) 
[[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_tum(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_tum(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_tum(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_tum(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_tumu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_tumu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_tumu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_tumu(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + 
vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei64_v_bf16mf4x2_mu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei64_v_bf16mf2x2_mu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei64_v_bf16m1x2_mu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei64_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei64_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei8.c new file mode 100644 index 0000000000000..fc11aef5bf5e0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg2ei8.c @@ -0,0 +1,258 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin 
-disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 
2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_tu(vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_tu(vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_tu(vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) 
@test_vloxseg2ei8_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_tum(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_tum(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_tum(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_tum(vbool4_t vm, vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) 
{ + return __riscv_vloxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_tumu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_tumu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_tumu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_tumu(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t 
test_vloxseg2ei8_v_bf16m4x2_tumu(vbool4_t vm, + vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vloxseg2ei8_v_bf16mf4x2_mu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vloxseg2ei8_v_bf16mf2x2_mu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vloxseg2ei8_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vloxseg2ei8_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vloxseg2ei8_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vloxseg2ei8_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vloxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei32.c new file mode 100644 index 0000000000000..ac1ac4aba264e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei32.c @@ -0,0 +1,214 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_tu(vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_tu(vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return 
__riscv_vloxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_tum(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_tum(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_tum(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_tum(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_tumu(vbool64_t vm, 
+ vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_tumu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_tumu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_tumu(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei32_v_bf16mf4x3_mu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei32_v_bf16mf2x3_mu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei32_v_bf16m1x3_mu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei32_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei32_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei64.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei64.c new file mode 100644 index 0000000000000..86519aafc1d35 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei64.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] 
+// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_tu(vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_tu(vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_tum(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_tum(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t 
test_vloxseg3ei64_v_bf16m1x3_tum(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_tum(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_tumu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_tumu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_tumu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_tumu(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei64_v_bf16mf4x3_mu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei64_v_bf16mf2x3_mu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) 
@test_vloxseg3ei64_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei64_v_bf16m1x3_mu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei64_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei64_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei8.c new file mode 100644 index 0000000000000..92ea80dfd2f45 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg3ei8.c @@ -0,0 +1,209 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 
3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_tu(vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_tu(vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t 
test_vloxseg3ei8_v_bf16mf4x3_tum(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_tum(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_tum(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_tumu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_tumu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_tumu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_tumu(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf4x3_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vloxseg3ei8_v_bf16mf4x3_mu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vloxseg3ei8_v_bf16mf2x3_mu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vloxseg3ei8_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vloxseg3ei8_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vloxseg3ei8_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei32.c new file mode 100644 index 0000000000000..7cb17187f7013 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei32.c @@ -0,0 +1,214 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) 
@test_vloxseg4ei32_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_tu(vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_tu(vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_tum(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_tum(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_tum(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_tum(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_tumu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_tumu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) 
@test_vloxseg4ei32_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_tumu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_tumu(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei32_v_bf16mf4x4_mu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei32_v_bf16mf2x4_mu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei32_v_bf16m1x4_mu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t 
rs2, size_t vl) { + return __riscv_vloxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei32_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei32_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei64.c new file mode 100644 index 0000000000000..054d04ff16da1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei64.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_tu(vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_tu(vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_tum(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_tum(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_tum(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_tum(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t 
rs2, size_t vl) { + return __riscv_vloxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei64_v_bf16mf4x4_tumu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_tumu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_tumu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_tumu(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t 
test_vloxseg4ei64_v_bf16mf4x4_mu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei64_v_bf16mf2x4_mu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei64_v_bf16m1x4_mu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei64_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei64_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vloxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei8.c new file mode 100644 index 0000000000000..f1571233cdd9e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg4ei8.c @@ -0,0 +1,209 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t 
test_vloxseg4ei8_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_tu(vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_tu(vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_tum(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_tum(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + 
return __riscv_vloxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_tum(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_tumu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_tumu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_tumu(vbool16_t vm, + 
vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_tumu(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vloxseg4ei8_v_bf16mf4x4_mu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vloxseg4ei8_v_bf16mf2x4_mu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vloxseg4ei8_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vloxseg4ei8_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// 
+vbfloat16m2x4_t test_vloxseg4ei8_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vloxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei32.c new file mode 100644 index 0000000000000..3f57c002fe4c2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_tu(vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_tum(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + 
return __riscv_vloxseg5ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_tum(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_tum(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_tumu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg5ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_tumu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg5ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t 
test_vloxseg5ei32_v_bf16m1x5_tumu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei32_v_bf16mf4x5_mu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei32_v_bf16mf2x5_mu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei32_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei32_v_bf16m1x5_mu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei64.c new file mode 100644 index 0000000000000..fb9850cb0bff9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// 
+vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_tu(vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_tum(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_tum(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t 
test_vloxseg5ei64_v_bf16m1x5_tum(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_tumu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg5ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_tumu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg5ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_tumu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei64_v_bf16mf4x5_mu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei64_v_bf16mf2x5_mu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei64_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei64_v_bf16m1x5_mu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg5ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei8.c new file mode 100644 index 0000000000000..06f3e8ffbf712 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg5ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_tu( +// 
CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t 
test_vloxseg5ei8_v_bf16m1x5_tu(vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_tum(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_tum(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_tum(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_tumu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_tumu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_tumu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vloxseg5ei8_v_bf16mf4x5_mu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16mf2x5_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vloxseg5ei8_v_bf16mf2x5_mu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vloxseg5ei8_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vloxseg5ei8_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg5ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei32.c new file mode 100644 index 0000000000000..94e44d09e8313 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 
4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_tu(vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_tum(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg6ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// 
+vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_tum(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_tum(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_tumu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg6ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_tumu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg6ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_tumu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei32_v_bf16mf4x6_mu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei32_v_bf16mf2x6_mu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei32_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei32_v_bf16m1x6_mu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei64.c new file mode 100644 index 0000000000000..2981b18c491d4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], 
[[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_tu(vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_tum(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_tum(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_tum(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_tumu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg6ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_tumu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg6ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_tumu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei64_v_bf16mf4x6_mu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei64_v_bf16mf2x6_mu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei64_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei64_v_bf16m1x6_mu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg6ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei8.c new file mode 100644 index 0000000000000..23fa390aef0dd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg6ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_tu(vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 
[[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_tum(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_tum(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_tum(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vloxseg6ei8_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_tumu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_tumu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_tumu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vloxseg6ei8_v_bf16mf4x6_mu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vloxseg6ei8_v_bf16mf2x6_mu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, 
size_t vl) { + return __riscv_vloxseg6ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vloxseg6ei8_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vloxseg6ei8_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg6ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei32.c new file mode 100644 index 0000000000000..f3293d0398585 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_tu(vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_tum(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg7ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_tum(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_tum(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_tumu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg7ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_tumu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, 
+ vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg7ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_tumu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei32_v_bf16mf4x7_mu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei32_v_bf16mf2x7_mu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei32_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei32_v_bf16m1x7_mu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei64.c new file mode 100644 index 0000000000000..10209cc2192ed --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin 
-disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_tu(vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_tum(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_tum(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, 
size_t vl) { + return __riscv_vloxseg7ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_tum(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei64_v_bf16mf4x7_tumu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg7ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_tumu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg7ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_tumu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t 
test_vloxseg7ei64_v_bf16mf4x7_mu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei64_v_bf16mf2x7_mu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei64_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei64_v_bf16m1x7_mu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg7ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei8.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei8.c new file mode 100644 index 0000000000000..8f03ff1603085 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg7ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// 
+vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_tu(vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_tum(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_tum(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_tum(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// 
+vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_tumu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_tumu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_tumu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vloxseg7ei8_v_bf16mf4x7_mu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vloxseg7ei8_v_bf16mf2x7_mu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vloxseg7ei8_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 
[[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vloxseg7ei8_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg7ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei32.c new file mode 100644 index 0000000000000..a8c0d4909af09 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], 
ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_tu(vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t 
test_vloxseg8ei32_v_bf16mf4x8_tum(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_tum(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_tum(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_tumu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_tumu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vloxseg8ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], 
i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_tumu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei32_v_bf16mf4x8_mu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei32_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei32_v_bf16mf2x8_mu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) 
@test_vloxseg8ei32_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei32_v_bf16m1x8_mu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei64.c new file mode 100644 index 0000000000000..08c8fdc56c9cb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], 
i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_tu(vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_tum(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_tum(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_tum(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_tumu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vloxseg8ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_tumu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vloxseg8ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) 
@test_vloxseg8ei64_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_tumu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei64_v_bf16mf4x8_mu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei64_v_bf16mf2x8_mu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei64_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei64_v_bf16m1x8_mu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vloxseg8ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei8.c new file mode 100644 index 0000000000000..247dcfb578294 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vloxseg8ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", 
, 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_tu(vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_tum(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_tum(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", 
, 8) @test_vloxseg8ei8_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_tum(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_tumu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_tumu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_tumu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vloxseg8ei8_v_bf16mf4x8_mu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t 
rs2, size_t vl) { + return __riscv_vloxseg8ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vloxseg8ei8_v_bf16mf2x8_mu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vloxseg8ei8_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vloxseg8ei8_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vloxseg8ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei32.c new file mode 100644 index 0000000000000..eef643f85e038 --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei32.c @@ -0,0 +1,243 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_tu(vbfloat16m1_t vd, const 
__bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, 
+ const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei32_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i32.i64( [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei32_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei32_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint32m2_t rs2, + size_t vl) { + return __riscv_vluxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei32_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint32m4_t rs2, + size_t vl) { + return __riscv_vluxei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei32_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i32.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei32_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint32m8_t rs2, + size_t vl) { + return __riscv_vluxei32_mu(vm, vd, rs1, rs2, vl); +} 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei64.c new file mode 100644 index 0000000000000..08eac74a0163b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei64.c @@ -0,0 +1,196 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 
*rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei64_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei64_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei64_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint64m4_t rs2, + size_t vl) { + return __riscv_vluxei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei64_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i64.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei64_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint64m8_t rs2, + size_t vl) { + return __riscv_vluxei64_mu(vm, vd, rs1, rs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei8.c new file mode 100644 index 0000000000000..88c00a6c86423 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxei8.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_tu(vbfloat16mf4_t vd, const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_tu(vbfloat16mf2_t vd, const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_tu(vbfloat16m1_t vd, const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_tu(vbfloat16m2_t vd, const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_tu(vbfloat16m4_t vd, const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_tu(vbfloat16m8_t vd, const __bf16 *rs1, + vuint8m4_t rs2, size_t vl) { + return __riscv_vluxei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_tum( 
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vluxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vluxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], 
ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vluxei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vluxei8_tumu(vm, 
vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vluxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vluxei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv1bf16.p0.nxv1i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vluxei8_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + const __bf16 *rs1, vuint8mf8_t rs2, + size_t vl) { + return __riscv_vluxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv2bf16.p0.nxv2i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vluxei8_v_bf16mf2_mu(vbool32_t vm, 
vbfloat16mf2_t vd, + const __bf16 *rs1, vuint8mf4_t rs2, + size_t vl) { + return __riscv_vluxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv4bf16.p0.nxv4i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vluxei8_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + const __bf16 *rs1, vuint8mf2_t rs2, + size_t vl) { + return __riscv_vluxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv8bf16.p0.nxv8i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vluxei8_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + const __bf16 *rs1, vuint8m1_t rs2, + size_t vl) { + return __riscv_vluxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv16bf16.p0.nxv16i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vluxei8_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + const __bf16 *rs1, vuint8m2_t rs2, + size_t vl) { + return __riscv_vluxei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vluxei8_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vluxei.mask.nxv32bf16.p0.nxv32i8.i64( [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vluxei8_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + const __bf16 *rs1, vuint8m4_t rs2, + size_t vl) { + return __riscv_vluxei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei32.c new file mode 100644 index 0000000000000..a6d5aa949b4d8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei32.c @@ -0,0 +1,264 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_tu(vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_tu(vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_tu(vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_tum(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_tum(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_tum(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], 
[[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_tum(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_tum(vbool4_t vm, + vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_tumu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_tumu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_tumu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_tumu(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_tumu(vbool4_t vm, + vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei32_v_bf16mf4x2_mu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + 
vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei32_v_bf16mf2x2_mu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei32_v_bf16m1x2_mu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei32_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei32_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i32.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei32_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint32m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei64.c new file mode 100644 index 0000000000000..f19c96521a7a6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei64.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: 
-emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_tu(vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_tu(vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_tum(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_tum(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_tum(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_tum(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_tumu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_tumu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + 
vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei64_v_bf16m1x2_tumu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_tumu(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei64_v_bf16mf4x2_mu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei64_v_bf16mf2x2_mu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t 
test_vluxseg2ei64_v_bf16m1x2_mu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei64_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei64_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg2ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei8.c new file mode 100644 index 0000000000000..07e820458e709 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg2ei8.c @@ -0,0 +1,258 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_tu(vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_tu(vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_tu(vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return 
__riscv_vluxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_tu(vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_tu(vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_tum(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_tum(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_tum(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, 
size_t vl) { + return __riscv_vluxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_tum(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_tum(vbool4_t vm, vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_tumu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_tumu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t 
test_vluxseg2ei8_v_bf16m1x2_tumu(vbool16_t vm, + vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_tumu(vbool8_t vm, + vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_tumu(vbool4_t vm, + vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr 
noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf4x2_t test_vluxseg2ei8_v_bf16mf4x2_mu(vbool64_t vm, + vbfloat16mf4x2_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16mf2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16mf2x2_t test_vluxseg2ei8_v_bf16mf2x2_mu(vbool32_t vm, + vbfloat16mf2x2_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m1x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m1x2_t test_vluxseg2ei8_v_bf16m1x2_mu(vbool16_t vm, vbfloat16m1x2_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m2x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv16i8_2t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m2x2_t test_vluxseg2ei8_v_bf16m2x2_mu(vbool8_t vm, vbfloat16m2x2_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 2) @test_vluxseg2ei8_v_bf16m4x2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 2) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv16i8.nxv16i1.i64(target("riscv.vector.tuple", , 2) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 2) [[TMP0]] +// +vbfloat16m4x2_t test_vluxseg2ei8_v_bf16m4x2_mu(vbool4_t vm, vbfloat16m4x2_t vd, + const __bf16 *rs1, + vuint8m2_t rs2, size_t vl) { + return __riscv_vluxseg2ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei32.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei32.c new file mode 100644 index 0000000000000..05bece790e068 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei32.c @@ -0,0 +1,214 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] 
+// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_tu(vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_tu(vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_tum(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_tum(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t 
test_vluxseg3ei32_v_bf16m1x3_tum(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_tum(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_tumu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_tumu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_tumu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_tumu(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei32_v_bf16mf4x3_mu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei32_v_bf16mf2x3_mu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) 
@test_vluxseg3ei32_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei32_v_bf16m1x3_mu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei32_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei32_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei64.c new file mode 100644 index 0000000000000..16898e7e27471 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei64.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_tu( +// CHECK-RV64-SAME: 
target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_tu(vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_tu(vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// 
+vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_tum(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_tum(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_tum(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_tum(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_tumu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_tumu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_tumu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_tumu(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei64_v_bf16mf4x3_mu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei64_v_bf16mf2x3_mu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei64_v_bf16m1x3_mu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei64_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei64_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg3ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei8.c new file mode 100644 index 0000000000000..b48554b75eadc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg3ei8.c @@ -0,0 +1,209 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_tu(vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_tu(vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.i64(target("riscv.vector.tuple", 
, 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_tu(vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_tu(vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_tum(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_tum( +// CHECK-RV64-SAME: 
[[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_tum(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_tum(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_tum(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_tumu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_tumu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_tumu(vbool16_t vm, + vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_tumu(vbool8_t vm, + vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf4x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf4x3_t test_vluxseg3ei8_v_bf16mf4x3_mu(vbool64_t vm, + vbfloat16mf4x3_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16mf2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16mf2x3_t test_vluxseg3ei8_v_bf16mf2x3_mu(vbool32_t vm, + vbfloat16mf2x3_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m1x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m1x3_t test_vluxseg3ei8_v_bf16m1x3_mu(vbool16_t vm, vbfloat16m1x3_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t 
vl) { + return __riscv_vluxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 3) @test_vluxseg3ei8_v_bf16m2x3_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 3) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 3) [[TMP0]] +// +vbfloat16m2x3_t test_vluxseg3ei8_v_bf16m2x3_mu(vbool8_t vm, vbfloat16m2x3_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg3ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei32.c new file mode 100644 index 0000000000000..dfd314f8f23ed --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei32.c @@ -0,0 +1,214 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 
4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_tu(vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) 
@test_vluxseg4ei32_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_tu(vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_tum(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_tum(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_tum(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_tum(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_tumu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_tumu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_tumu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_tumu(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei32_v_bf16mf4x4_mu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + 
vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei32_v_bf16mf2x4_mu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei32_v_bf16m1x4_mu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei32_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i32.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei32_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint32m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei64.c new file mode 100644 index 0000000000000..a8f779af29cd8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei64.c @@ -0,0 +1,213 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t 
vl) { + return __riscv_vluxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_tu(vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_tu(vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_tum(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_tum(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return 
__riscv_vluxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_tum(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_tum(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_tumu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_tumu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t 
test_vluxseg4ei64_v_bf16m1x4_tumu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_tumu(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei64_v_bf16mf4x4_mu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei64_v_bf16mf2x4_mu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei64_v_bf16m1x4_mu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei64_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i64.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// 
CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei64_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint64m8_t rs2, size_t vl) { + return __riscv_vluxseg4ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei8.c new file mode 100644 index 0000000000000..b3fc409391c6a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg4ei8.c @@ -0,0 +1,209 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_tu(vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], 
[[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_tu(vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_tu(vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_tu(vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + 
return __riscv_vluxseg4ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_tum(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_tum(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_tum(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_tum(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_tumu(vbool64_t vm, + 
vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_tumu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_tumu(vbool16_t vm, + vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_tumu(vbool8_t vm, + vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf4x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16mf4x4_t test_vluxseg4ei8_v_bf16mf4x4_mu(vbool64_t vm, + vbfloat16mf4x4_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16mf2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) 
[[TMP0]] +// +vbfloat16mf2x4_t test_vluxseg4ei8_v_bf16mf2x4_mu(vbool32_t vm, + vbfloat16mf2x4_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m1x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m1x4_t test_vluxseg4ei8_v_bf16m1x4_mu(vbool16_t vm, vbfloat16m1x4_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 4) @test_vluxseg4ei8_v_bf16m2x4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 4) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv8i8.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 4) [[TMP0]] +// +vbfloat16m2x4_t test_vluxseg4ei8_v_bf16m2x4_mu(vbool8_t vm, vbfloat16m2x4_t vd, + const __bf16 *rs1, + vuint8m1_t rs2, size_t vl) { + return __riscv_vluxseg4ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei32.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei32.c new file mode 100644 index 0000000000000..f40d2b8e162e9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] 
+// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_tu(vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_tum(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_tum(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_tum(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) 
[[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_tumu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_tumu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg5ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_tumu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: 
[[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei32_v_bf16mf4x5_mu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei32_v_bf16mf2x5_mu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei32_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei32_v_bf16m1x5_mu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei64.c new file mode 100644 index 0000000000000..da3efe94fafbf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_tu(vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// 
+vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_tum(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_tum(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_tum(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_tumu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg5ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_tumu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg5ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_tumu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei64_v_bf16mf4x5_mu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei64_v_bf16mf2x5_mu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 5) @test_vluxseg5ei64_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei64_v_bf16m1x5_mu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg5ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei8.c new file mode 100644 index 0000000000000..422a271e583aa --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg5ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 5) 
[[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_tu(vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_tu(vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_tu(vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_tum(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_tum(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_tum(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_tumu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_tumu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_tumu(vbool16_t vm, + vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf4x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf4x5_t test_vluxseg5ei8_v_bf16mf4x5_mu(vbool64_t vm, + vbfloat16mf4x5_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16mf2x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16mf2x5_t test_vluxseg5ei8_v_bf16mf2x5_mu(vbool32_t vm, + vbfloat16mf2x5_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 5) @test_vluxseg5ei8_v_bf16m1x5_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 5) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 5) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 5) [[TMP0]] +// +vbfloat16m1x5_t test_vluxseg5ei8_v_bf16m1x5_mu(vbool16_t vm, vbfloat16m1x5_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg5ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei32.c new file mode 100644 index 0000000000000..ecdd9cc8ff315 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vluxseg6ei32_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_tu(vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_tum(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg6ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_tum(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_tum( +// CHECK-RV64-SAME: 
[[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_tum(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_tumu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg6ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_tumu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg6ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_tumu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei32_v_bf16mf4x6_mu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei32_v_bf16mf2x6_mu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei32_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei32_v_bf16m1x6_mu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei64.c new file mode 100644 index 0000000000000..d428a81cfddc8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei64.c @@ -0,0 +1,163 @@ +// 
NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_tu( 
+// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_tu(vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_tum(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, 
i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_tum(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_tum(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_tumu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg6ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) 
@test_vluxseg6ei64_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_tumu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg6ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_tumu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei64_v_bf16mf4x6_mu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei64_v_bf16mf2x6_mu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg6ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei64_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei64_v_bf16m1x6_mu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint64m4_t 
rs2, size_t vl) { + return __riscv_vluxseg6ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei8.c new file mode 100644 index 0000000000000..cb38825634e43 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg6ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_tu(vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_tu(vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_tu(vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_tum(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_tum(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_tum(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_tumu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_tumu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_tumu(vbool16_t vm, + vbfloat16m1x6_t vd, + const __bf16 *rs1, + 
vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf4x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf4x6_t test_vluxseg6ei8_v_bf16mf4x6_mu(vbool64_t vm, + vbfloat16mf4x6_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16mf2x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16mf2x6_t test_vluxseg6ei8_v_bf16mf2x6_mu(vbool32_t vm, + vbfloat16mf2x6_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 6) @test_vluxseg6ei8_v_bf16m1x6_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 6) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 6) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 6) [[TMP0]] +// +vbfloat16m1x6_t test_vluxseg6ei8_v_bf16m1x6_mu(vbool16_t vm, vbfloat16m1x6_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg6ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei32.c new file mode 100644 index 0000000000000..c446be91583f5 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t 
vl) { + return __riscv_vluxseg7ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_tu(vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_tum(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_tum(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_tum(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + 
vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_tumu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_tumu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg7ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_tumu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei32_v_bf16mf4x7_mu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// 
+vbfloat16mf2x7_t test_vluxseg7ei32_v_bf16mf2x7_mu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei32_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei32_v_bf16m1x7_mu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei64.c new file mode 100644 index 0000000000000..06f159dab8630 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_tu(vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) 
{ + return __riscv_vluxseg7ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_tum(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_tum(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_tum(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_tumu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg7ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t 
test_vluxseg7ei64_v_bf16mf2x7_tumu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg7ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_tumu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei64_v_bf16mf4x7_mu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) 
[[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei64_v_bf16mf2x7_mu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei64_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei64_v_bf16m1x7_mu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg7ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei8.c new file mode 100644 index 0000000000000..8f91c7d642553 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg7ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: 
%clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_tu(vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_tu(vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_tu(vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_tum(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_tum(vbool32_t vm, + 
vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_tum(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_tumu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_tumu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_tumu(vbool16_t vm, + vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf4x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] 
+// +vbfloat16mf4x7_t test_vluxseg7ei8_v_bf16mf4x7_mu(vbool64_t vm, + vbfloat16mf4x7_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16mf2x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16mf2x7_t test_vluxseg7ei8_v_bf16mf2x7_mu(vbool32_t vm, + vbfloat16mf2x7_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 7) @test_vluxseg7ei8_v_bf16m1x7_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 7) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 7) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 7) [[TMP0]] +// +vbfloat16m1x7_t test_vluxseg7ei8_v_bf16m1x7_mu(vbool16_t vm, vbfloat16m1x7_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg7ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei32.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei32.c new file mode 100644 index 0000000000000..0fef431e84c08 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei32.c @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] 
+// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_tu(vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_tum(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_tum(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_tum(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) 
[[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_tumu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_tumu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, + size_t vl) { + return __riscv_vluxseg8ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_tumu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: 
[[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i32.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei32_v_bf16mf4x8_mu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint32mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i32.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei32_v_bf16mf2x8_mu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint32m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei32_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i32.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], 
[[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei32_v_bf16m1x8_mu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint32m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei32_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei64.c new file mode 100644 index 0000000000000..5e941b53727de --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei64.c @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: 
target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_tu(vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// 
+vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_tum(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_tum(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_tum(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_tumu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, + size_t vl) { + return __riscv_vluxseg8ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_tumu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, + size_t vl) { + return __riscv_vluxseg8ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_tumu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i64.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei64_v_bf16mf4x8_mu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint64m1_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i64.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei64_v_bf16mf2x8_mu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint64m2_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
target("riscv.vector.tuple", , 8) @test_vluxseg8ei64_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i64.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei64_v_bf16m1x8_mu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint64m4_t rs2, size_t vl) { + return __riscv_vluxseg8ei64_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei8.c new file mode 100644 index 0000000000000..49cb2aa8921de --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vluxseg8ei8.c @@ -0,0 +1,160 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.i64(target("riscv.vector.tuple", , 8) 
[[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_tu(vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_tu(vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_tu( +// CHECK-RV64-SAME: target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], i64 [[VL]], i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_tu(vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tu(vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef 
[[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_tum(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_tum(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 2, i64 4) +// CHECK-RV64-NEXT: ret 
target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_tum(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tum(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_tumu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_tumu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 0, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_tumu(vbool16_t vm, + vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_tumu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf4x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.p0.nxv1i8.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf4x8_t test_vluxseg8ei8_v_bf16mf4x8_mu(vbool64_t vm, + vbfloat16mf4x8_t vd, + const __bf16 *rs1, + vuint8mf8_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16mf2x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv2i8.nxv2i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr 
[[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16mf2x8_t test_vluxseg8ei8_v_bf16mf2x8_mu(vbool32_t vm, + vbfloat16mf2x8_t vd, + const __bf16 *rs1, + vuint8mf4_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_mu(vm, vd, rs1, rs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local target("riscv.vector.tuple", , 8) @test_vluxseg8ei8_v_bf16m1x8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], target("riscv.vector.tuple", , 8) [[VD:%.*]], ptr noundef [[RS1:%.*]], [[RS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv4i8.nxv4i1.i64(target("riscv.vector.tuple", , 8) [[VD]], ptr [[RS1]], [[RS2]], [[VM]], i64 [[VL]], i64 1, i64 4) +// CHECK-RV64-NEXT: ret target("riscv.vector.tuple", , 8) [[TMP0]] +// +vbfloat16m1x8_t test_vluxseg8ei8_v_bf16m1x8_mu(vbool16_t vm, vbfloat16m1x8_t vd, + const __bf16 *rs1, + vuint8mf2_t rs2, size_t vl) { + return __riscv_vluxseg8ei8_mu(vm, vd, rs1, rs2, vl); +} diff --git a/clang/test/CodeGen/enum3.c b/clang/test/CodeGen/enum3.c new file mode 100644 index 0000000000000..6878a0bbb94d0 --- /dev/null +++ b/clang/test/CodeGen/enum3.c @@ -0,0 +1,26 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-error=underlying-atomic-qualifier-ignored -std=c23 %s -emit-llvm -o - | FileCheck %s + +// Ensure that an "atomic" underlying type has no actual atomic semantics +// because the qualifier is stripped. 
+ +enum E : _Atomic(int) { + Foo +}; + +// CHECK-LABEL: define {{.*}} void @test( +// CHECK-SAME: i32 noundef [[E:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[E_ADDR:%.*]] = alloca i32 +// CHECK-NEXT: [[X:%.*]] = alloca i32 +// CHECK-NEXT: store i32 [[E]], ptr [[E_ADDR]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[E_ADDR]] +// CHECK-NEXT: store i32 [[TMP0]], ptr [[X]] +// CHECK-NEXT: store i32 0, ptr [[E_ADDR]] +// CHECK-NEXT: ret void +// +void test(enum E e) { + int x = e; + e = Foo; +} + diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index bbb55b7e14941..bfbed79dc7f16 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -209,9 +209,9 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 // NOCPU-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8 // NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8 -// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 -// NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8 -// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0 +// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8 +// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) 
@__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]]) // NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -587,9 +587,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] -// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 -// GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8 -// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0 +// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8 +// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]]) // GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] // GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, 
i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 421099d3876e3..a1b91d0cc38dc 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -24,6 +24,24 @@ void test_s_monitor_sleep() { __builtin_amdgcn_s_monitor_sleep(10); } +// CHECK-LABEL: @test_s_wait_asynccnt( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.s.wait.asynccnt(i16 0) +// CHECK-NEXT: ret void +// +void test_s_wait_asynccnt() { + __builtin_amdgcn_s_wait_asynccnt(0); +} + +// CHECK-LABEL: @test_s_wait_tensorcnt( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.s.wait.tensorcnt(i16 0) +// CHECK-NEXT: ret void +// +void test_s_wait_tensorcnt() { + __builtin_amdgcn_s_wait_tensorcnt(0); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c index 0dd7eb0c738db..f00940bd7613d 100644 --- a/clang/test/Driver/frame-pointer-elim.c +++ b/clang/test/Driver/frame-pointer-elim.c @@ -44,8 +44,8 @@ // RUN: FileCheck --check-prefix=KEEP-NONE %s // -pg -fomit-frame-pointer => error. 
-// RUN: not %clang -### -S -fomit-frame-pointer -pg %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-OMIT-FP-PG %s -// RUN: %clang -### -S -fomit-frame-pointer -fno-omit-frame-pointer -pg %s 2>&1 | FileCheck -check-prefix=CHECK-MIX-NO-OMIT-FP-PG %s +// RUN: not %clang -### --target=i386-linux -S -fomit-frame-pointer -pg %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-OMIT-FP-PG %s +// RUN: %clang -### --target=i386-linux -S -fomit-frame-pointer -fno-omit-frame-pointer -pg %s 2>&1 | FileCheck -check-prefix=CHECK-MIX-NO-OMIT-FP-PG %s // CHECK-NO-MIX-OMIT-FP-PG: '-fomit-frame-pointer' not allowed with '-pg' // CHECK-MIX-NO-OMIT-FP-PG-NOT: '-fomit-frame-pointer' not allowed with '-pg' diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index 62e7c9588ce66..77f4cfb5f3a43 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -395,3 +395,12 @@ // RUN: --offload-arch=sm_52 -foffload-lto=thin -nogpulib -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefix=THINLTO-SM52 %s // THINLTO-SM52: --device-compiler=nvptx64-nvidia-cuda=-flto=thin + +// +// Check the requested architecture is passed if provided. 
+// +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ +// RUN: --offload-arch=gfx906 -foffload-lto=thin -nogpulib -nogpuinc %s 2>&1 \ +// RUN: | FileCheck --check-prefix=SHOULD-EXTRACT %s +// +// SHOULD-EXTRACT: clang-linker-wrapper{{.*}}"--should-extract=gfx906" diff --git a/clang/test/OpenMP/parallel_ast_print.cpp b/clang/test/OpenMP/parallel_ast_print.cpp index 948baaff30d89..15439ea31215a 100644 --- a/clang/test/OpenMP/parallel_ast_print.cpp +++ b/clang/test/OpenMP/parallel_ast_print.cpp @@ -173,13 +173,13 @@ T tmain(T argc, T *argv) { foo(); #endif #ifdef OMP60 -#pragma omp parallel default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(strict: C) copyin(S::TS, thrp) proc_bind(primary) reduction(+:c, arr1[argc]) reduction(max:e, arr[:C][0:10]) +#pragma omp parallel default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(strict: C) copyin(S::TS, thrp) proc_bind(primary) reduction(+:c, arr1[argc]) reduction(max:e, arr[:C][0:10]) message("msg") severity(fatal) foo(); #endif #pragma omp parallel if (C) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(default, && : g) reduction(task,+:argc) foo(); #ifdef OMP60 -#pragma omp parallel if (C) num_threads(strict: s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(default, && : g) reduction(task,+:argc) +#pragma omp parallel if (C) num_threads(strict: s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(default, && : g) reduction(task,+:argc) message("msg") severity(warning) foo(); #endif return 0; @@ -196,11 +196,11 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: foo() // OMP51-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(C) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:C][0:10]) // OMP51-NEXT: foo() -// OMP60-NEXT: #pragma omp parallel default(none) 
private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(strict: C) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:C][0:10]) +// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(strict: C) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:C][0:10]) message("msg") severity(fatal) // OMP60-NEXT: foo() // CHECK-NEXT: #pragma omp parallel if(C) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(default, &&: g) reduction(task, +: argc) // CHECK-NEXT: foo() -// OMP60-NEXT: #pragma omp parallel if(C) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(default, &&: g) reduction(task, +: argc) +// OMP60-NEXT: #pragma omp parallel if(C) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(default, &&: g) reduction(task, +: argc) message("msg") severity(warning) // OMP60-NEXT: foo() // CHECK: template<> int tmain(int argc, int *argv) { // CHECK-NEXT: int b = argc, c, d, e, f, g; @@ -213,11 +213,11 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: foo() // OMP51-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10]) // OMP51-NEXT: foo() -// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(strict: 5) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10]) +// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(strict: 5) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10]) message("msg") severity(fatal) // OMP60-NEXT: foo() 
// CHECK-NEXT: #pragma omp parallel if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(default, &&: g) reduction(task, +: argc) // CHECK-NEXT: foo() -// OMP60-NEXT: #pragma omp parallel if(5) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(default, &&: g) reduction(task, +: argc) +// OMP60-NEXT: #pragma omp parallel if(5) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(default, &&: g) reduction(task, +: argc) message("msg") severity(warning) // OMP60-NEXT: foo() // CHECK: template<> long tmain(long argc, long *argv) { // CHECK-NEXT: long b = argc, c, d, e, f, g; @@ -230,11 +230,11 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: foo() // OMP51-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(1) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:1][0:10]) // OMP51-NEXT: foo() -// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(strict: 1) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:1][0:10]) +// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(strict: 1) copyin(S::TS,thrp) proc_bind(primary) reduction(+: c,arr1[argc]) reduction(max: e,arr[:1][0:10]) message("msg") severity(fatal) // OMP60-NEXT: foo() // CHECK-NEXT: #pragma omp parallel if(1) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(default, &&: g) reduction(task, +: argc) // CHECK-NEXT: foo() -// OMP60-NEXT: #pragma omp parallel if(1) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(default, &&: g) reduction(task, +: argc) +// OMP60-NEXT: #pragma omp parallel if(1) num_threads(strict: s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) 
reduction(default, &&: g) reduction(task, +: argc) message("msg") severity(warning) // OMP60-NEXT: foo() enum Enum { }; @@ -256,8 +256,8 @@ int main (int argc, char **argv) { foo(); // CHECK-NEXT: foo(); #ifdef OMP60 -#pragma omp parallel default(none), private(argc,b) firstprivate(argv) if (parallel: argc > 0) num_threads(strict: ee) copyin(a) proc_bind(spread) reduction(| : c, d, arr1[argc]) reduction(* : e, arr[:10][0:argc]) allocate(e) -// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) if(parallel: argc > 0) num_threads(strict: ee) copyin(a) proc_bind(spread) reduction(|: c,d,arr1[argc]) reduction(*: e,arr[:10][0:argc]) allocate(e) +#pragma omp parallel default(none), private(argc,b) firstprivate(argv) if (parallel: argc > 0) num_threads(strict: ee) copyin(a) proc_bind(spread) reduction(| : c, d, arr1[argc]) reduction(* : e, arr[:10][0:argc]) allocate(e) message("msg") severity(fatal) +// OMP60-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) if(parallel: argc > 0) num_threads(strict: ee) copyin(a) proc_bind(spread) reduction(|: c,d,arr1[argc]) reduction(*: e,arr[:10][0:argc]) allocate(e) message("msg") severity(fatal) foo(); // OMP60-NEXT: foo(); #endif @@ -266,8 +266,8 @@ int main (int argc, char **argv) { foo(); // CHECK-NEXT: foo() #ifdef OMP60 -#pragma omp parallel allocate(e) if (b) num_threads(strict: c) proc_bind(close) reduction(^:e, f) reduction(&& : g, arr[0:argc][:10]) -// OMP60-NEXT: #pragma omp parallel allocate(e) if(b) num_threads(strict: c) proc_bind(close) reduction(^: e,f) reduction(&&: g,arr[0:argc][:10]) +#pragma omp parallel allocate(e) if (b) num_threads(strict: c) proc_bind(close) reduction(^:e, f) reduction(&& : g, arr[0:argc][:10]) message("msg") severity(warning) +// OMP60-NEXT: #pragma omp parallel allocate(e) if(b) num_threads(strict: c) proc_bind(close) reduction(^: e,f) reduction(&&: g,arr[0:argc][:10]) message("msg") severity(warning) foo(); // OMP60-NEXT: foo() #endif 
diff --git a/clang/test/OpenMP/parallel_message_messages.cpp b/clang/test/OpenMP/parallel_message_messages.cpp new file mode 100644 index 0000000000000..470fadc032280 --- /dev/null +++ b/clang/test/OpenMP/parallel_message_messages.cpp @@ -0,0 +1,89 @@ +// RUN: %clang_cc1 -verify=expected -fopenmp -fopenmp-version=60 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected -fopenmp-simd -fopenmp-version=60 -ferror-limit 100 %s -Wuninitialized + +void foo() {} + +template +T tmain(T argc, S **argv) { + // Correct usage + #pragma omp parallel message("correct message") + + // Missing parentheses + #pragma omp parallel message // expected-error {{expected '(' after 'message'}} + + // Empty parentheses + #pragma omp parallel message() // expected-error {{expected expression}} + + // Non-string literal + #pragma omp parallel message(123) // expected-warning {{expected string literal in 'clause message' - ignoring}} + #pragma omp parallel message(argc) // expected-warning {{expected string literal in 'clause message' - ignoring}} + #pragma omp parallel message(argv[0]) // expected-warning {{expected string literal in 'clause message' - ignoring}} + + // Multiple arguments + #pragma omp parallel message("msg1", "msg2") // expected-error {{expected ')'}} expected-note {{to match this '('}} + + // Unterminated string + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-warning@+1 {{missing terminating '"' character}} expected-note@+1 {{to match this '('}} + #pragma omp parallel message("unterminated + + // Unterminated clause + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp parallel message("msg" + + // Extra tokens after clause + #pragma omp parallel message("msg") extra // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} + + // Multiple message clauses + #pragma omp parallel message("msg1") message("msg2") // expected-error {{directive 
'#pragma omp parallel' cannot contain more than one 'message' clause}} + + // Message clause with other clauses (should be valid, but test for interaction) + #pragma omp parallel message("msg") num_threads(2) + + // Message clause with invalid clause + #pragma omp parallel message("msg") invalid_clause // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} + + // Message clause with missing string and other clause + #pragma omp parallel message() num_threads(2) // expected-error {{expected expression}} + + // Message clause with macro that is not a string + #define NOT_A_STRING 123 + #pragma omp parallel message(NOT_A_STRING) // expected-warning {{expected string literal in 'clause message' - ignoring}} + + // Message clause with template parameter that is not a string + #pragma omp parallel message(N) // expected-warning {{expected string literal in 'clause message' - ignoring}} + + // Message clause with macro that is a string + #define A_STRING "macro string" + #pragma omp parallel message(A_STRING) + + // Message clause with concatenated string literals + #pragma omp parallel message("hello" " world") + + // Message clause with wide string literal + #pragma omp parallel message(L"wide string") + + // Message clause with UTF-8 string literal + #pragma omp parallel message(u8"utf8 string") + + // Message clause with raw string literal + #pragma omp parallel message(R"(raw string)") + + foo(); + + return argc; +} + +int main(int argc, char **argv) { + // Correct usage + #pragma omp parallel message("main correct") + + // Invalid: missing string + #pragma omp parallel message() // expected-error {{expression}} + + // Invalid: non-string + #pragma omp parallel message(argc) // expected-warning {{expected string literal in 'clause message' - ignoring}} + + foo(); + + return tmain(argc, argv); +} diff --git a/clang/test/OpenMP/parallel_severity_messages.cpp b/clang/test/OpenMP/parallel_severity_messages.cpp new file mode 100644 index 
0000000000000..b1cff762d9bd8 --- /dev/null +++ b/clang/test/OpenMP/parallel_severity_messages.cpp @@ -0,0 +1,70 @@ +// RUN: %clang_cc1 -verify=expected -fopenmp -fopenmp-version=60 -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected -fopenmp-simd -fopenmp-version=60 -ferror-limit 100 %s -Wuninitialized + +void foo() {} + +template +T tmain(T argc, S **argv) { + // Correct usages + #pragma omp parallel severity(fatal) + #pragma omp parallel severity(warning) + + // Missing parentheses + #pragma omp parallel severity // expected-error {{expected '(' after 'severity'}} + + // Empty parentheses + #pragma omp parallel severity() // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + + // Invalid value + #pragma omp parallel severity(error) // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + #pragma omp parallel severity(unknown) // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + + // Multiple arguments + #pragma omp parallel severity(fatal, warning) // expected-error {{expected ')'}} expected-note {{to match this '('}} + + // Unterminated clause + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp parallel severity(fatal + + // Extra tokens after clause + #pragma omp parallel severity(fatal) extra // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} + + // Multiple severity clauses + #pragma omp parallel severity(fatal) severity(warning) // expected-error {{directive '#pragma omp parallel' cannot contain more than one 'severity' clause}} + + // Severity clause with other clauses (should be valid) + #pragma omp parallel severity(warning) num_threads(2) + + // Severity clause with invalid clause + #pragma omp parallel severity(fatal) invalid_clause // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} + + // Severity clause with macro that is not a valid value 
+ #define NOT_A_SEVERITY 123 + #pragma omp parallel severity(NOT_A_SEVERITY) // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + + // Severity clause with macro that is a valid value + #define FATAL fatal + #pragma omp parallel severity(FATAL) + + // Severity clause with template parameter that is not a valid value + #pragma omp parallel severity(N) // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + + foo(); + + return argc; +} + +int main(int argc, char **argv) { + // Correct usage + #pragma omp parallel severity(fatal) + + // Invalid: missing value + #pragma omp parallel severity() // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + + // Invalid: non-keyword + #pragma omp parallel severity(argc) // expected-error {{expected 'fatal' or 'warning' in OpenMP clause 'severity'}} + + foo(); + + return tmain(argc, argv); +} diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 86d51820ae5b5..e82d825704439 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -2556,25 +2556,25 @@ // RUN: %clang -march=sierraforest -m32 -E -dM %s -o - 2>&1 \ // RUN: --target=i386 \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_SRF_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_SRF_M32,CHECK_KL_M32 // RUN: %clang -march=grandridge -m32 -E -dM %s -o - 2>&1 \ // RUN: --target=i386 \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_SRF_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_SRF_M32,CHECK_KL_M32 // RUN: %clang -march=arrowlake -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ARL_M32 +// RUN: | FileCheck -match-full-lines %s 
-check-prefixes=CHECK_ARL_M32,CHECK_KL_M32 // RUN: %clang -march=arrowlake-s -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_KL_M32 // RUN: %clang -march=lunarlake -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_KL_M32 // RUN: %clang -march=pantherlake -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_PTL_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_PTL_M32,CHECK_NKL_M32 // RUN: %clang -march=clearwaterforest -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_PTL_M32,CHECK_CWF_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_PTL_M32,CHECK_CWF_M32,CHECK_NKL_M32 // CHECK_ARL_M32: #define __ADX__ 1 // CHECK_ARL_M32: #define __AES__ 1 // CHECK_ARL_M32: #define __AVX2__ 1 @@ -2601,7 +2601,8 @@ // CHECK_ARL_M32: #define __GFNI__ 1 // CHECK_ARL_M32: #define __HRESET__ 1 // CHECK_ARL_M32: #define __INVPCID__ 1 -// CHECK_ARL_M32: #define __KL__ 1 +// CHECK_KL_M32: #define __KL__ 1 +// CHECK_NKL_M32-NOT: __KL__ // CHECK_ARL_M32: #define __LZCNT__ 1 // CHECK_ARL_M32: #define __MMX__ 1 // CHECK_ARL_M32: #define __MOVBE__ 1 @@ -2645,7 +2646,8 @@ // CHECK_ARL_M32: #define __VAES__ 1 // CHECK_ARL_M32: #define __VPCLMULQDQ__ 1 // CHECK_ARL_M32: #define __WAITPKG__ 1 -// CHECK_ARL_M32: #define __WIDEKL__ 1 +// CHECK_KL_M32: #define __WIDEKL__ 1 +// CHECK_NKL_M32-NOT: __WIDEKL__ // 
CHECK_ARL_M32: #define __XSAVEC__ 1 // CHECK_ARL_M32: #define __XSAVEOPT__ 1 // CHECK_ARL_M32: #define __XSAVES__ 1 @@ -2659,25 +2661,25 @@ // RUN: %clang -march=sierraforest -m64 -E -dM %s -o - 2>&1 \ // RUN: --target=i386 \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_KL_M64 // RUN: %clang -march=grandridge -m64 -E -dM %s -o - 2>&1 \ // RUN: --target=i386 \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_KL_M64 // RUN: %clang -march=arrowlake -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ARL_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_KL_M64 // RUN: %clang -march=arrowlake-s -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_KL_M64 // RUN: %clang -march=lunarlake -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_KL_M64 // RUN: %clang -march=pantherlake -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_PTL_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_NKL_M64 // RUN: %clang -march=clearwaterforest -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s 
-check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_CWF_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_CWF_M64,CHECK_NKL_M64 // CHECK_ARL_M64: #define __ADX__ 1 // CHECK_ARL_M64: #define __AES__ 1 // CHECK_ARL_M64: #define __AVX2__ 1 @@ -2704,7 +2706,8 @@ // CHECK_ARL_M64: #define __GFNI__ 1 // CHECK_ARL_M64: #define __HRESET__ 1 // CHECK_ARL_M64: #define __INVPCID__ 1 -// CHECK_ARL_M64: #define __KL__ 1 +// CHECK_KL_M64: #define __KL__ 1 +// CHECK_NKL_M64-NOT: __KL__ // CHECK_ARL_M64: #define __LZCNT__ 1 // CHECK_ARL_M64: #define __MMX__ 1 // CHECK_ARL_M64: #define __MOVBE__ 1 @@ -2749,7 +2752,8 @@ // CHECK_ARL_M64: #define __VAES__ 1 // CHECK_ARL_M64: #define __VPCLMULQDQ__ 1 // CHECK_ARL_M64: #define __WAITPKG__ 1 -// CHECK_ARL_M64: #define __WIDEKL__ 1 +// CHECK_KL_M64: #define __WIDEKL__ 1 +// CHECK_NKL_M64-NOT: __WIDEKL__ // CHECK_ARL_M64: #define __XSAVEC__ 1 // CHECK_ARL_M64: #define __XSAVEOPT__ 1 // CHECK_ARL_M64: #define __XSAVES__ 1 diff --git a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bf16.cpp b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bf16.cpp new file mode 100644 index 0000000000000..e7d1aa045edf9 --- /dev/null +++ b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bf16.cpp @@ -0,0 +1,71 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s +// REQUIRES: aarch64-registered-target + +#include + +#if defined __ARM_FEATURE_SME +#define MODE_ATTR __arm_streaming +#else +#define MODE_ATTR +#endif + +__attribute__((target("bf16"))) +void test_bf16(svbool_t pg, svfloat32_t svf32, svbfloat16_t svbf16, bfloat16_t bf16) MODE_ATTR +{ + svbfdot_f32(svf32, svbf16, svbf16); + svbfdot_n_f32(svf32, svbf16, bf16); + 
svbfdot_lane_f32(svf32, svbf16, svbf16, 0); + + svbfmlalb_f32(svf32, svbf16, svbf16); + svbfmlalb_n_f32(svf32, svbf16, bf16); + svbfmlalb_lane_f32(svf32, svbf16, svbf16, 0); + + svbfmlalt_f32(svf32, svbf16, svbf16); + svbfmlalt_n_f32(svf32, svbf16, bf16); + svbfmlalt_lane_f32(svf32, svbf16, svbf16, 0); + + svcvt_bf16_f32_m(svbf16, pg, svf32); + svcvt_bf16_f32_x(pg, svf32); + svcvt_bf16_f32_z(pg, svf32); + + svcvtnt_bf16_f32_m(svbf16, pg, svf32); + svcvtnt_bf16_f32_x(svbf16, pg, svf32); +} + +void test_no_bf16(svbool_t pg, svfloat32_t svf32, svbfloat16_t svbf16, bfloat16_t bf16) MODE_ATTR +{ + // expected-error@+1 {{'svbfdot_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfdot_f32(svf32, svbf16, svbf16); + // expected-error@+1 {{'svbfdot_n_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfdot_n_f32(svf32, svbf16, bf16); + // expected-error@+1 {{'svbfdot_lane_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfdot_lane_f32(svf32, svbf16, svbf16, 0); + + // expected-error@+1 {{'svbfmlalb_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfmlalb_f32(svf32, svbf16, svbf16); + // expected-error@+1 {{'svbfmlalb_n_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfmlalb_n_f32(svf32, svbf16, bf16); + // expected-error@+1 {{'svbfmlalb_lane_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfmlalb_lane_f32(svf32, svbf16, svbf16, 0); + + // expected-error@+1 {{'svbfmlalt_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfmlalt_f32(svf32, svbf16, svbf16); + // expected-error@+1 {{'svbfmlalt_n_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfmlalt_n_f32(svf32, svbf16, bf16); + // expected-error@+1 {{'svbfmlalt_lane_f32' needs target feature (sve,bf16)|(sme,bf16)}} + svbfmlalt_lane_f32(svf32, svbf16, svbf16, 0); + + // expected-error@+1 {{'svcvt_bf16_f32_m' needs target feature (sve,bf16)|(sme,bf16)}} + svcvt_bf16_f32_m(svbf16, pg, svf32); + // expected-error@+1 {{'svcvt_bf16_f32_x' needs target feature (sve,bf16)|(sme,bf16)}} + 
svcvt_bf16_f32_x(pg, svf32); + // expected-error@+1 {{'svcvt_bf16_f32_z' needs target feature (sve,bf16)|(sme,bf16)}} + svcvt_bf16_f32_z(pg, svf32); + + // expected-error@+1 {{'svcvtnt_bf16_f32_m' needs target feature (sve,bf16)|(sme,bf16)}} + svcvtnt_bf16_f32_m(svbf16, pg, svf32); + // NOTE: svcvtnt_bf16_f32_x is a macro that expands to svcvtnt_bf16_f32_m. + // expected-error@+1 {{'svcvtnt_bf16_f32_m' needs target feature (sve,bf16)|(sme,bf16)}} + svcvtnt_bf16_f32_x(svbf16, pg, svf32); +} diff --git a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bf16_non_streaming_only.cpp b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bf16_non_streaming_only.cpp new file mode 100644 index 0000000000000..1960130fa2145 --- /dev/null +++ b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bf16_non_streaming_only.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s +// REQUIRES: aarch64-registered-target + +#include + +__attribute__((target("bf16"))) +void test_bf16(svfloat32_t svf32, svbfloat16_t svbf16) +{ + svbfmmla_f32(svf32, svbf16, svbf16); +} + +void test_no_bf16(svfloat32_t svf32, svbfloat16_t svbf16) +{ + // expected-error@+1 {{'svbfmmla_f32' needs target feature sve,bf16}} + svbfmmla_f32(svf32, svbf16, svbf16); +} + +__attribute__((target("sme,bf16"))) +void test_bf16_streaming(svfloat32_t svf32, svbfloat16_t svbf16) __arm_streaming +{ + // expected-error@+1 {{builtin can only be called from a non-streaming function}} + svbfmmla_f32(svf32, svbf16, svbf16); +} diff --git a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp b/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp deleted file mode 100644 index fcdd0516ed5a9..0000000000000 --- a/clang/test/Sema/aarch64-sve-intrinsics/acle_sve_bfloat.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// REQUIRES: aarch64-registered-target - -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify 
-verify-ignore-unexpected=error,note -emit-llvm -o - %s - -#include - -void test_bfloat(svbool_t pg, uint64_t u64, int64_t i64, const bfloat16_t *const_bf16_ptr, bfloat16_t *bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x3_t bf16x3, svbfloat16x4_t bf16x4) -{ - // expected-error@+1 {{'svcreate2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svcreate2_bf16(bf16, bf16); - // expected-error@+1 {{'svcreate3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svcreate3_bf16(bf16, bf16, bf16); - // expected-error@+1 {{'svcreate4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svcreate4_bf16(bf16, bf16, bf16, bf16); - // expected-error@+1 {{'svget2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svget2_bf16(bf16x2, 1); - // expected-error@+1 {{'svget3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svget3_bf16(bf16x3, 1); - // expected-error@+1 {{'svget4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svget4_bf16(bf16x4, 1); - // expected-error@+1 {{'svld1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svld1_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svld1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svld1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svld1rq_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svld1rq_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svldff1_bf16' needs target feature sve,bf16}} - svldff1_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svldff1_vnum_bf16' needs target feature sve,bf16}} - svldff1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svldnf1_bf16' needs target feature sve,bf16}} - svldnf1_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svldnf1_vnum_bf16' needs target feature sve,bf16}} - svldnf1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svldnt1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svldnt1_bf16(pg, const_bf16_ptr); - // expected-error@+1 {{'svldnt1_vnum_bf16' needs target feature 
(sve,bf16)|(sme,bf16)}} - svldnt1_vnum_bf16(pg, const_bf16_ptr, i64); - // expected-error@+1 {{'svrev_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svrev_bf16(bf16); - // expected-error@+1 {{'svset2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svset2_bf16(bf16x2, 1, bf16); - // expected-error@+1 {{'svset3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svset3_bf16(bf16x3, 1, bf16); - // expected-error@+1 {{'svset4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svset4_bf16(bf16x4, 1, bf16); - // expected-error@+1 {{'svst1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svst1_bf16(pg, bf16_ptr, bf16); - // expected-error@+1 {{'svst1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svst1_vnum_bf16(pg, bf16_ptr, i64, bf16); - // expected-error@+1 {{'svstnt1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svstnt1_bf16(pg, bf16_ptr, bf16); - // expected-error@+1 {{'svstnt1_vnum_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svstnt1_vnum_bf16(pg, bf16_ptr, i64, bf16); - // expected-error@+1 {{'svtrn1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svtrn1_bf16(bf16, bf16); - // expected-error@+1 {{'svtrn1q_bf16' needs target feature sve,bf16}} - svtrn1q_bf16(bf16, bf16); - // expected-error@+1 {{'svtrn2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svtrn2_bf16(bf16, bf16); - // expected-error@+1 {{'svtrn2q_bf16' needs target feature sve,bf16}} - svtrn2q_bf16(bf16, bf16); - // expected-error@+1 {{'svundef_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svundef_bf16(); - // expected-error@+1 {{'svundef2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svundef2_bf16(); - // expected-error@+1 {{'svundef3_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svundef3_bf16(); - // expected-error@+1 {{'svundef4_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svundef4_bf16(); - // expected-error@+1 {{'svuzp1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svuzp1_bf16(bf16, bf16); - // 
expected-error@+1 {{'svuzp1q_bf16' needs target feature sve,bf16}} - svuzp1q_bf16(bf16, bf16); - // expected-error@+1 {{'svuzp2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svuzp2_bf16(bf16, bf16); - // expected-error@+1 {{'svuzp2q_bf16' needs target feature sve,bf16}} - svuzp2q_bf16(bf16, bf16); - // expected-error@+1 {{'svzip1_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svzip1_bf16(bf16, bf16); - // expected-error@+1 {{'svzip1q_bf16' needs target feature sve,bf16}} - svzip1q_bf16(bf16, bf16); - // expected-error@+1 {{'svzip2_bf16' needs target feature (sve,bf16)|(sme,bf16)}} - svzip2_bf16(bf16, bf16); - // expected-error@+1 {{'svzip2q_bf16' needs target feature sve,bf16}} - svzip2q_bf16(bf16, bf16); -} diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp deleted file mode 100644 index 4a2f8238caf0e..0000000000000 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_bfloat.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// REQUIRES: aarch64-registered-target - -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -verify=overload -verify-ignore-unexpected=error,note -emit-llvm -o - %s - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -#include - -void test_bfloat(const bfloat16_t *const_bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t bf16x2) -{ - // expected-error@+2 {{'svwhilerw_bf16' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - // overload-error@+1 {{'svwhilerw' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - SVE_ACLE_FUNC(svwhilerw,_bf16,,)(const_bf16_ptr, const_bf16_ptr); - // expected-error@+2 {{'svtbx_bf16' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - // overload-error@+1 {{'svtbx' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - SVE_ACLE_FUNC(svtbx,_bf16,,)(bf16, bf16, svundef_u16()); - // expected-error@+2 {{'svtbl2_bf16' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - // overload-error@+1 {{'svtbl2' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - SVE_ACLE_FUNC(svtbl2,_bf16,,)(bf16x2, svundef_u16()); - // expected-error@+2 {{'svwhilewr_bf16' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - // overload-error@+1 {{'svwhilewr' needs target feature (sve,sve2,bf16)|(sme,bf16)}} - SVE_ACLE_FUNC(svwhilewr,_bf16,,)(const_bf16_ptr, const_bf16_ptr); -} diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant-builtins-vector.cpp similarity index 99% rename from clang/test/Sema/constant_builtins_vector.cpp rename to clang/test/Sema/constant-builtins-vector.cpp index f26dfb25d49b9..bde5c478b2b6f 100644 --- a/clang/test/Sema/constant_builtins_vector.cpp +++ b/clang/test/Sema/constant-builtins-vector.cpp @@ -27,6 +27,7 @@ typedef unsigned long long vector4ulong __attribute__((__vector_size__(32))); typedef unsigned int vector4uint __attribute__((__vector_size__(16))); typedef short vector4short __attribute__((__vector_size__(8))); typedef char vector4char __attribute__((__vector_size__(4))); +typedef unsigned char vector4uchar __attribute__((__vector_size__(4))); typedef BitInt8 vector4BitInt8 
__attribute__((__vector_size__(4))); typedef BitInt32 vector4BitInt32 __attribute__((__vector_size__(16))); typedef BitInt128 vector4BitInt128 __attribute__((__vector_size__(64))); @@ -848,6 +849,7 @@ static_assert(__builtin_elementwise_add_sat(~(1 << 31), 42) == ~(1 << 31)); static_assert(__builtin_elementwise_add_sat((1 << 31), -42) == (1 << 31)); static_assert(__builtin_elementwise_add_sat(~0U, 1U) == ~0U); static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_add_sat((vector4char){1, 2, 3, 4}, (vector4char){1, 2, 3, 4})) == (LITTLE_END ? 0x08060402 : 0x02040608)); +static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_add_sat((vector4uchar){1, 2, 3, 4}, (vector4uchar){0, 1, 2, 3})) == (LITTLE_END ? 0x07050301U : 0x01030507U)); static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_add_sat((vector4short){(short)0x8000, (short)0x8001, (short)0x8002, (short)0x8003}, (vector4short){-7, -8, -9, -10}) == (LITTLE_END ? 0x8000800080008000 : 0x8000800080008000))); static_assert(__builtin_elementwise_sub_sat(1, 2) == -1); @@ -856,4 +858,5 @@ static_assert(__builtin_elementwise_sub_sat(~(1 << 31), -42) == ~(1 << 31)); static_assert(__builtin_elementwise_sub_sat((1 << 31), 42) == (1 << 31)); static_assert(__builtin_elementwise_sub_sat(0U, 1U) == 0U); static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_sub_sat((vector4char){5, 4, 3, 2}, (vector4char){1, 1, 1, 1})) == (LITTLE_END ? 0x01020304 : 0x04030201)); +static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_sub_sat((vector4uchar){5, 4, 3, 2}, (vector4uchar){1, 1, 1, 1})) == (LITTLE_END ? 0x01020304U : 0x04030201U)); static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_sub_sat((vector4short){(short)0x8000, (short)0x8001, (short)0x8002, (short)0x8003}, (vector4short){7, 8, 9, 10}) == (LITTLE_END ? 
0x8000800080008000 : 0x8000800080008000))); diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp new file mode 100644 index 0000000000000..64ecba8faac9e --- /dev/null +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp @@ -0,0 +1,192 @@ +// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s +// REQUIRES: asserts + +struct MyObj { + int id; + ~MyObj() {} // Non-trivial destructor +}; + +// Simple Local Variable Address and Return +// CHECK-LABEL: Function: return_local_addr +MyObj* return_local_addr() { + MyObj x {10}; + MyObj* p = &x; +// CHECK: Block B{{[0-9]+}}: +// CHECK: Issue (LoanID: [[L_X:[0-9]+]], OriginID: [[O_ADDR_X:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_ADDR_X]]) + return p; +// CHECK: AssignOrigin (DestID: [[O_RET_VAL:[0-9]+]], SrcID: [[O_P]]) +// CHECK: ReturnOfOrigin (OriginID: [[O_RET_VAL]]) +// CHECK: Expire (LoanID: [[L_X]]) +} + + +// Pointer Assignment and Return +// CHECK-LABEL: Function: assign_and_return_local_addr +// CHECK-NEXT: Block B{{[0-9]+}}: +MyObj* assign_and_return_local_addr() { + MyObj y{20}; + MyObj* ptr1 = &y; +// CHECK: Issue (LoanID: [[L_Y:[0-9]+]], OriginID: [[O_ADDR_Y:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_PTR1:[0-9]+]], SrcID: [[O_ADDR_Y]]) + MyObj* ptr2 = ptr1; +// CHECK: AssignOrigin (DestID: [[O_PTR1_RVAL:[0-9]+]], SrcID: [[O_PTR1]]) +// CHECK: AssignOrigin (DestID: [[O_PTR2:[0-9]+]], SrcID: [[O_PTR1_RVAL]]) + ptr2 = ptr1; +// CHECK: AssignOrigin (DestID: [[O_PTR1_RVAL_2:[0-9]+]], SrcID: [[O_PTR1]]) +// CHECK: AssignOrigin (DestID: [[O_PTR2]], SrcID: [[O_PTR1_RVAL_2]]) + ptr2 = ptr2; // Self assignment. 
+// CHECK: AssignOrigin (DestID: [[O_PTR2_RVAL:[0-9]+]], SrcID: [[O_PTR2]]) +// CHECK: AssignOrigin (DestID: [[O_PTR2]], SrcID: [[O_PTR2_RVAL]]) + return ptr2; +// CHECK: AssignOrigin (DestID: [[O_PTR2_RVAL_2:[0-9]+]], SrcID: [[O_PTR2]]) +// CHECK: ReturnOfOrigin (OriginID: [[O_PTR2_RVAL_2]]) +// CHECK: Expire (LoanID: [[L_Y]]) +} + + +// Return of Non-Pointer Type +// CHECK-LABEL: Function: return_int_val +// CHECK-NEXT: Block B{{[0-9]+}}: +int return_int_val() { + int x = 10; + return x; +} +// CHECK-NEXT: End of Block + + +// Loan Expiration (Automatic Variable, C++) +// CHECK-LABEL: Function: loan_expires_cpp +// CHECK-NEXT: Block B{{[0-9]+}}: +void loan_expires_cpp() { + MyObj obj{1}; + MyObj* pObj = &obj; +// CHECK: Issue (LoanID: [[L_OBJ:[0-9]+]], OriginID: [[O_ADDR_OBJ:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_POBJ:[0-9]+]], SrcID: [[O_ADDR_OBJ]]) +// CHECK: Expire (LoanID: [[L_OBJ]]) +} + + +// FIXME: No expire for Trivial Destructors +// CHECK-LABEL: Function: loan_expires_trivial +// CHECK-NEXT: Block B{{[0-9]+}}: +void loan_expires_trivial() { + int trivial_obj = 1; + int* pTrivialObj = &trivial_obj; +// CHECK: Issue (LoanID: [[L_TRIVIAL_OBJ:[0-9]+]], OriginID: [[O_ADDR_TRIVIAL_OBJ:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_PTOBJ:[0-9]+]], SrcID: [[O_ADDR_TRIVIAL_OBJ]]) +// CHECK-NOT: Expire (LoanID: [[L_TRIVIAL_OBJ]]) +// CHECK-NEXT: End of Block + // FIXME: Add check for Expire once trivial destructors are handled for expiration. 
+} + + +// CHECK-LABEL: Function: conditional +void conditional(bool condition) { + int a = 5; + int b = 10; + int* p = nullptr; + + if (condition) + p = &a; + // CHECK: Issue (LoanID: [[L_A:[0-9]+]], OriginID: [[O_ADDR_A:[0-9]+]]) + // CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_ADDR_A]]) + else + p = &b; + // CHECK: Issue (LoanID: [[L_B:[0-9]+]], OriginID: [[O_ADDR_B:[0-9]+]]) + // CHECK: AssignOrigin (DestID: [[O_P]], SrcID: [[O_ADDR_B]]) + int *q = p; + // CHECK: AssignOrigin (DestID: [[O_P_RVAL:[0-9]+]], SrcID: [[O_P]]) + // CHECK: AssignOrigin (DestID: [[O_Q:[0-9]+]], SrcID: [[O_P_RVAL]]) +} + + +// CHECK-LABEL: Function: overwrite_origin +void overwrite_origin() { + MyObj s1; + MyObj s2; + MyObj* p = &s1; +// CHECK: Block B{{[0-9]+}}: +// CHECK: Issue (LoanID: [[L_S1:[0-9]+]], OriginID: [[O_ADDR_S1:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_ADDR_S1]]) + p = &s2; +// CHECK: Issue (LoanID: [[L_S2:[0-9]+]], OriginID: [[O_ADDR_S2:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P]], SrcID: [[O_ADDR_S2]]) +// CHECK: Expire (LoanID: [[L_S2]]) +// CHECK: Expire (LoanID: [[L_S1]]) +} + + +// CHECK-LABEL: Function: reassign_to_null +void reassign_to_null() { + MyObj s1; + MyObj* p = &s1; +// CHECK: Block B{{[0-9]+}}: +// CHECK: Issue (LoanID: [[L_S1:[0-9]+]], OriginID: [[O_ADDR_S1:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_ADDR_S1]]) + p = nullptr; +// CHECK: AssignOrigin (DestID: [[O_P]], SrcID: [[O_NULLPTR:[0-9]+]]) +// CHECK: Expire (LoanID: [[L_S1]]) +} +// FIXME: Have a better representation for nullptr than just an empty origin. +// It should be a separate loan and origin kind. 
+ + +// CHECK-LABEL: Function: reassign_in_if +void reassign_in_if(bool condition) { + MyObj s1; + MyObj s2; + MyObj* p = &s1; +// CHECK: Block B{{[0-9]+}}: +// CHECK: Issue (LoanID: [[L_S1:[0-9]+]], OriginID: [[O_ADDR_S1:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_ADDR_S1]]) + if (condition) { + p = &s2; +// CHECK: Block B{{[0-9]+}}: +// CHECK: Issue (LoanID: [[L_S2:[0-9]+]], OriginID: [[O_ADDR_S2:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P]], SrcID: [[O_ADDR_S2]]) + } +// CHECK: Block B{{[0-9]+}}: +// CHECK: Expire (LoanID: [[L_S2]]) +// CHECK: Expire (LoanID: [[L_S1]]) +} + + +// CHECK-LABEL: Function: nested_scopes +void nested_scopes() { + MyObj* p = nullptr; +// CHECK: Block B{{[0-9]+}}: +// CHECK: AssignOrigin (DestID: [[O_NULLPTR_CAST:[0-9]+]], SrcID: [[O_NULLPTR:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_NULLPTR_CAST]]) + { + MyObj outer; + p = &outer; +// CHECK: Issue (LoanID: [[L_OUTER:[0-9]+]], OriginID: [[O_ADDR_OUTER:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P]], SrcID: [[O_ADDR_OUTER]]) + { + MyObj inner; + p = &inner; +// CHECK: Issue (LoanID: [[L_INNER:[0-9]+]], OriginID: [[O_ADDR_INNER:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P]], SrcID: [[O_ADDR_INNER]]) + } +// CHECK: Expire (LoanID: [[L_INNER]]) + } +// CHECK: Expire (LoanID: [[L_OUTER]]) +} + + +// CHECK-LABEL: Function: pointer_indirection +void pointer_indirection() { + int a; + int *p = &a; +// CHECK: Block B1: +// CHECK: Issue (LoanID: [[L_A:[0-9]+]], OriginID: [[O_ADDR_A:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_P:[0-9]+]], SrcID: [[O_ADDR_A]]) + int **pp = &p; +// CHECK: Issue (LoanID: [[L_P:[0-9]+]], OriginID: [[O_ADDR_P:[0-9]+]]) +// CHECK: AssignOrigin (DestID: [[O_PP:[0-9]+]], SrcID: [[O_ADDR_P]]) + +// FIXME: The Origin for the RHS is broken + int *q = *pp; +// CHECK: AssignOrigin (DestID: [[O_Q:[0-9]+]], SrcID: {{[0-9]+}}) +} diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp 
b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp index b02930e145700..6bf2f441e83c4 100644 --- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp +++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp @@ -60,3 +60,17 @@ concept atomicish = requires() { }; atomicish f(); // expected-error {{expected 'auto' or 'decltype(auto)' after concept name}} } // namespace GH138820 + +namespace GH138823 { + template void foo(); + template + concept ConceptA = requires { foo(); }; + // expected-error@-1 {{expression contains unexpanded parameter pack 'Ts'}} + + template + concept ConceptB = ConceptA; + + template void bar(Foo); + + void test() { bar(1); } +} diff --git a/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp b/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp index 33a6039459484..aa8d055e44971 100644 --- a/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp +++ b/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp @@ -3,6 +3,10 @@ // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -fcxx-exceptions // RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -fblocks %s -DCPP14_AND_EARLIER -fcxx-exceptions +// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks %s -fcxx-exceptions -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++20 -verify -fsyntax-only -fblocks %s -fcxx-exceptions -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -fcxx-exceptions -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++14 -verify -fsyntax-only -fblocks %s -DCPP14_AND_EARLIER -fcxx-exceptions -fexperimental-new-constant-interpreter namespace test_lambda_is_literal { #ifdef CPP14_AND_EARLIER diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 3a3dc8855d827..6987d0c020457 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -1290,3 +1290,60 
@@ void f() { } + +namespace GH147121 { +struct X {}; +struct S1 { + bool operator==(this auto &&, const X &); // #S1-cand +}; +struct S2 { + bool operator==(this X, const auto &&); // #S2-cand +}; + +struct S3 { + S3& operator++(this X); // #S3-inc-cand + S3& operator++(this int); // #S3-inc-cand + int operator[](this X); // #S3-sub-cand + int operator[](this int); // #S3-sub-cand2 + void f(this X); // #S3-f-cand + void f(this int); // #S3-f-cand2 +}; + +int main() { + S1{} == S1{}; + // expected-error@-1 {{invalid operands to binary expression ('S1' and 'S1')}} + // expected-note@#S1-cand {{candidate function template not viable}} + // expected-note@#S1-cand {{candidate function (with reversed parameter order) template not viable}} + + + S1{} != S1{}; + // expected-error@-1 {{invalid operands to binary expression ('S1' and 'S1')}} + // expected-note@#S1-cand {{candidate function template not viable}} + // expected-note@#S1-cand {{candidate function (with reversed parameter order) template not viable}} + + + S2{} == S2{}; + // expected-error@-1 {{invalid operands to binary expression ('S2' and 'S2')}} + // expected-note@#S2-cand {{candidate function template not viable}} + // expected-note@#S2-cand {{candidate function (with reversed parameter order) template not viable}} + + + S2{} != S2{}; + // expected-error@-1 {{invalid operands to binary expression ('S2' and 'S2')}} + // expected-note@#S2-cand {{candidate function template not viable}} + // expected-note@#S2-cand {{candidate function (with reversed parameter order) template not viable}} + + S3 s3; + ++s3; + // expected-error@-1{{cannot increment value of type 'S3'}} + s3[]; + // expected-error@-1{{no viable overloaded operator[] for type 'S3'}} + // expected-note@#S3-sub-cand {{candidate function not viable: no known conversion from 'S3' to 'X' for object argument}} + // expected-note@#S3-sub-cand2 {{candidate function not viable: no known conversion from 'S3' to 'int' for object argument}} + + s3.f(); + // 
expected-error@-1{{no matching member function for call to 'f'}} + // expected-note@#S3-f-cand {{candidate function not viable: no known conversion from 'S3' to 'X' for object argument}} + // expected-note@#S3-f-cand2 {{candidate function not viable: no known conversion from 'S3' to 'int' for object argument}} +} +} diff --git a/clang/test/SemaCXX/enum-scoped.cpp b/clang/test/SemaCXX/enum-scoped.cpp index d7b7923430aff..0ce47274979d9 100644 --- a/clang/test/SemaCXX/enum-scoped.cpp +++ b/clang/test/SemaCXX/enum-scoped.cpp @@ -349,3 +349,18 @@ enum class B; A a; B b{a}; // expected-error {{cannot initialize}} } + +namespace GH147736 { +template +struct S { + enum OhBoy : Ty { // expected-error 2 {{'_Atomic' qualifier ignored; operations involving the enumeration type will be non-atomic}} + Unimportant + } e; +}; + +// Okay, was previously rejected. The underlying type is int. +S<_Atomic(int)> s; // expected-warning {{'_Atomic' is a C11 extension}} + // expected-note@-1 {{in instantiation of template class 'GH147736::S<_Atomic(int)>' requested here}} +static_assert(__is_same(__underlying_type(S<_Atomic(long long)>::OhBoy), long long), ""); // expected-warning {{'_Atomic' is a C11 extension}} + // expected-note@-1 {{in instantiation of template class 'GH147736::S<_Atomic(long long)>' requested here}} +} diff --git a/clang/test/SemaCXX/uninitialized-multiple-uses.cpp b/clang/test/SemaCXX/uninitialized-multiple-uses.cpp new file mode 100644 index 0000000000000..a6a4ad39d0be0 --- /dev/null +++ b/clang/test/SemaCXX/uninitialized-multiple-uses.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -fsyntax-only -Wuninitialized -verify %s + +void use_val(int); +void use_const_ref(const int &); + +// Test that the warning about self initialization is generated only once. 
+void test_self_init_1warning(bool a) { + int v = v; // expected-warning {{variable 'v' is uninitialized when used within its own initialization}} + if (a) + use_val(v); + else + use_const_ref(v); +} + +// Test that the diagnostic for using an uninitialized variable directly has a +// higher priority than using the same variable via a const reference. +void test_prioritize_use_over_const_ref(bool a) { + int v; // expected-note {{initialize the variable 'v' to silence this warning}} + if (a) // expected-warning {{variable 'v' is used uninitialized whenever 'if' condition is false}} + // expected-note@-1 {{remove the 'if' if its condition is always true}} + v = 2; + else + use_const_ref(v); + use_val(v); // expected-note {{uninitialized use occurs here}} +} diff --git a/clang/test/SemaCXX/warn-uninitialized-const-reference.cpp b/clang/test/SemaCXX/warn-uninitialized-const-reference.cpp index d24b561441d8f..7204d6525cef9 100644 --- a/clang/test/SemaCXX/warn-uninitialized-const-reference.cpp +++ b/clang/test/SemaCXX/warn-uninitialized-const-reference.cpp @@ -27,7 +27,7 @@ int const_use(const int i); void f(int a) { int i; const_ref_use(i); // expected-warning {{variable 'i' is uninitialized when passed as a const reference argument here}} - int j = j + const_ref_use(j); // expected-warning {{variable 'j' is uninitialized when used within its own initialization}} expected-warning {{variable 'j' is uninitialized when passed as a const reference argument here}} + int j = j + const_ref_use(j); // expected-warning {{variable 'j' is uninitialized when used within its own initialization}} A a1 = const_ref_use_A(a1); // expected-warning {{variable 'a1' is uninitialized when passed as a const reference argument here}} int k = const_use(k); // expected-warning {{variable 'k' is uninitialized when used within its own initialization}} A a2 = const_use_A(a2); // expected-warning {{variable 'a2' is uninitialized when used within its own initialization}} diff --git 
a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl index 477a16a454a9c..d7c6876d3b9e3 100644 --- a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl @@ -111,4 +111,8 @@ Buffer BufferErr3; void main() { (void)Buff.__handle; // expected-error {{'__handle' is a private member of 'hlsl::Buffer>'}} // expected-note@* {{implicitly declared private here}} + + // expected-error@+2 {{cannot assign to return value because function 'operator[]' returns a const value}} + // expected-note@* {{function 'operator[]' which returns const-qualified type 'vector' declared here}} + Buff[0] = 0.0; } diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl index bf541f4a07da7..fbd9288590adc 100644 --- a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl @@ -28,4 +28,8 @@ StructuredBuffer BufferErr4; void main() { (void)Buff.__handle; // expected-error {{'__handle' is a private member of 'hlsl::StructuredBuffer>'}} // expected-note@* {{implicitly declared private here}} + + // expected-error@+2 {{cannot assign to return value because function 'operator[]' returns a const value}} + // expected-note@* {{function 'operator[]' which returns const-qualified type 'vector' declared here}} + Buff[0] = 0.0; } diff --git a/clang/test/SemaHLSL/RootSignature-err.hlsl b/clang/test/SemaHLSL/RootSignature-err.hlsl index 118fc38daf3f2..04013974d28b9 100644 --- a/clang/test/SemaHLSL/RootSignature-err.hlsl +++ b/clang/test/SemaHLSL/RootSignature-err.hlsl @@ -34,3 +34,7 @@ void bad_root_signature_5() {} // expected-error@+1 {{expected ')' to denote end of parameters, or, another valid parameter of RootConstants}} [RootSignature(MultiLineRootSignature)] void bad_root_signature_6() {} + +// expected-error@+1 {{expected end of stream to denote end of parameters, or, another valid parameter of RootSignature}} 
+[RootSignature("RootFlags() RootConstants(b0, num32BitConstants = 1)")] +void bad_root_signature_7() {} diff --git a/clang/test/SemaHLSL/RootSignature.hlsl b/clang/test/SemaHLSL/RootSignature.hlsl new file mode 100644 index 0000000000000..810f81479caab --- /dev/null +++ b/clang/test/SemaHLSL/RootSignature.hlsl @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -fsyntax-only %s -verify + +// expected-no-diagnostics + +// Test that we have consistent behaviour for comma parsing. Namely: +// - a single trailing comma is allowed after any parameter +// - a trailing comma is not required + +[RootSignature("CBV(b0, flags = DATA_VOLATILE,), DescriptorTable(Sampler(s0,),),")] +void maximum_commas() {} + +[RootSignature("CBV(b0, flags = DATA_VOLATILE), DescriptorTable(Sampler(s0))")] +void minimal_commas() {} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 7494c4f984353..9711b3bdded6b 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -12,6 +12,14 @@ void test_s_monitor_sleep(short a) { __builtin_amdgcn_s_monitor_sleep(a); // expected-error {{'__builtin_amdgcn_s_monitor_sleep' must be a constant integer}} } +void test_s_wait_asynccnt(short a) { + __builtin_amdgcn_s_wait_asynccnt(a); // expected-error {{'__builtin_amdgcn_s_wait_asynccnt' must be a constant integer}} +} + +void test_s_wait_tensorcnt(short a) { + __builtin_amdgcn_s_wait_tensorcnt(a); // expected-error {{'__builtin_amdgcn_s_wait_tensorcnt' must be a constant integer}} +} + void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) { __builtin_amdgcn_cvt_f16_fp8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_fp8' must be a constant integer}} } diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 
0f1fa8b329fd6..9d34b62da20f5 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -1302,6 +1302,9 @@ getDeviceInput(const ArgList &Args) { // after every regular input file so that libraries may be included out of // order. This follows 'ld.lld' semantics which are more lenient. bool Extracted = true; + llvm::DenseSet ShouldExtract; + for (auto &Arg : Args.getAllArgValues(OPT_should_extract)) + ShouldExtract.insert(Arg); while (Extracted) { Extracted = false; for (OffloadFile &Binary : ArchiveFilesToExtract) { @@ -1315,8 +1318,9 @@ getDeviceInput(const ArgList &Args) { CompatibleTargets.emplace_back(ID); for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) { - // Only extract an if we have an an object matching this target. - if (!InputFiles.count(ID)) + // Only extract an if we have an an object matching this target or it + // was specifically requested. + if (!InputFiles.count(ID) && !ShouldExtract.contains(ID.second)) continue; Expected ExtractOrErr = @@ -1330,7 +1334,7 @@ getDeviceInput(const ArgList &Args) { // Skip including the file if it is an archive that does not resolve // any symbols. - if (!Extracted) + if (!Extracted && !ShouldExtract.contains(ID.second)) continue; // If another target needs this binary it must be copied instead. 
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 17fb9db35fe39..fa73e02fd5178 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -59,6 +59,10 @@ def override_image : Joined<["--"], "override-image=">, Flags<[WrapperOnlyOption]>, MetaVarName<"">, HelpText<"Uses the provided file as if it were the output of the device link step">; +def should_extract : CommaJoined<["--"], "should-extract=">, + Flags<[WrapperOnlyOption]>, MetaVarName<"">, + HelpText<"Set of device architectures we should always extract if found.">; + // Flags passed to the device linker. def arch_EQ : Joined<["--"], "arch=">, Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">, diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 8b590bd57e1a3..f10b73278381b 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -94,6 +94,7 @@ static bool DeprecatedDriverCommand; static ResourceDirRecipeKind ResourceDirRecipe; static bool Verbose; static bool PrintTiming; +static bool EmitVisibleModules; static llvm::BumpPtrAllocator Alloc; static llvm::StringSaver Saver{Alloc}; static std::vector CommandLine; @@ -232,6 +233,8 @@ static void ParseArgs(int argc, char **argv) { PrintTiming = Args.hasArg(OPT_print_timing); + EmitVisibleModules = Args.hasArg(OPT_emit_visible_modules); + Verbose = Args.hasArg(OPT_verbose); RoundTripArgs = Args.hasArg(OPT_round_trip_args); @@ -380,6 +383,14 @@ static auto toJSONSorted(llvm::json::OStream &JOS, }; } +static auto toJSONSorted(llvm::json::OStream &JOS, std::vector V) { + llvm::sort(V); + return [&JOS, V = std::move(V)] { + for (const StringRef Entry : V) + JOS.value(Entry); + }; +} + // Thread safe. 
class FullDeps { public: @@ -396,6 +407,7 @@ class FullDeps { ID.NamedModule = std::move(TUDeps.ID.ModuleName); ID.NamedModuleDeps = std::move(TUDeps.NamedModuleDeps); ID.ClangModuleDeps = std::move(TUDeps.ClangModuleDeps); + ID.VisibleModules = std::move(TUDeps.VisibleModules); ID.DriverCommandLine = std::move(TUDeps.DriverCommandLine); ID.Commands = std::move(TUDeps.Commands); @@ -525,6 +537,9 @@ class FullDeps { JOS.attributeArray("file-deps", toJSONStrings(JOS, I.FileDeps)); JOS.attribute("input-file", StringRef(I.FileName)); + if (EmitVisibleModules) + JOS.attributeArray("visible-clang-modules", + toJSONSorted(JOS, I.VisibleModules)); }); } } else { @@ -545,6 +560,9 @@ class FullDeps { JOS.attributeArray("file-deps", toJSONStrings(JOS, I.FileDeps)); JOS.attribute("input-file", StringRef(I.FileName)); + if (EmitVisibleModules) + JOS.attributeArray("visible-clang-modules", + toJSONSorted(JOS, I.VisibleModules)); }); } }); @@ -596,6 +614,7 @@ class FullDeps { std::string NamedModule; std::vector NamedModuleDeps; std::vector ClangModuleDeps; + std::vector VisibleModules; std::vector DriverCommandLine; std::vector Commands; }; @@ -623,11 +642,12 @@ static bool handleTranslationUnitResult( return false; } -static bool handleModuleResult( - StringRef ModuleName, llvm::Expected &MaybeModuleGraph, - FullDeps &FD, size_t InputIndex, SharedStream &OS, SharedStream &Errs) { - if (!MaybeModuleGraph) { - llvm::handleAllErrors(MaybeModuleGraph.takeError(), +static bool handleModuleResult(StringRef ModuleName, + llvm::Expected &MaybeTUDeps, + FullDeps &FD, size_t InputIndex, + SharedStream &OS, SharedStream &Errs) { + if (!MaybeTUDeps) { + llvm::handleAllErrors(MaybeTUDeps.takeError(), [&ModuleName, &Errs](llvm::StringError &Err) { Errs.applyLocked([&](raw_ostream &OS) { OS << "Error while scanning dependencies for " @@ -637,7 +657,7 @@ static bool handleModuleResult( }); return true; } - FD.mergeDeps(std::move(*MaybeModuleGraph), InputIndex); + 
FD.mergeDeps(std::move(MaybeTUDeps->ModuleGraph), InputIndex); return false; } diff --git a/clang/tools/clang-scan-deps/Opts.td b/clang/tools/clang-scan-deps/Opts.td index 9cccbb3aaf0c8..03011f9ae1f75 100644 --- a/clang/tools/clang-scan-deps/Opts.td +++ b/clang/tools/clang-scan-deps/Opts.td @@ -37,6 +37,9 @@ defm resource_dir_recipe : Eq<"resource-dir-recipe", "How to produce missing '-r def print_timing : F<"print-timing", "Print timing information">; +def emit_visible_modules + : F<"emit-visible-modules", "emit visible modules in primary output">; + def verbose : F<"v", "Use verbose output">; def round_trip_args : F<"round-trip-args", "verify that command-line arguments are canonical by parsing and re-serializing">; diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index a16fbffb76270..88707551b7698 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -747,16 +747,14 @@ TEST_F(FormatTestComments, DontSplitLineCommentsWithEscapedNewlines) { " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", getLLVMStyleWithColumns(50))); - // FIXME: One day we might want to implement adjustment of leading whitespace - // of the consecutive lines in this kind of comment: - EXPECT_EQ("double\n" - " a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", - format("double a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" - " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", - getLLVMStyleWithColumns(49))); + verifyFormat("double\n" + " a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", + "double a; // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // 
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\\\n" + " // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", + getLLVMStyleWithColumns(49)); } TEST_F(FormatTestComments, DontIntroduceMultilineComments) { diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index a1285e4bc9bf8..e281a4945a862 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -4126,6 +4126,13 @@ TEST_F(TokenAnnotatorTest, JsonCodeInRawString) { EXPECT_TOKEN(Tokens[6], tok::colon, TT_DictLiteral); } +TEST_F(TokenAnnotatorTest, LineCommentTrailingBackslash) { + auto Tokens = annotate("// a \\\n" + "// b"); + ASSERT_EQ(Tokens.size(), 3u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::comment, TT_LineComment); +} + } // namespace } // namespace format } // namespace clang diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp index ff1697f1bbb9a..e82dcadebba3f 100644 --- a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp +++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp @@ -1238,4 +1238,166 @@ TEST_F(ParseHLSLRootSignatureTest, InvalidNonZeroFlagsTest) { ASSERT_TRUE(Consumer->isSatisfied()); } +TEST_F(ParseHLSLRootSignatureTest, InvalidRootElementMissingCommaTest) { + // This test will check that an error is produced when there is a missing + // comma between parameters + const llvm::StringLiteral Source = R"cc( + RootFlags() + RootConstants(num32BitConstants = 1, b0) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + + SmallVector Elements; + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Elements, + Signature, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + 
ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidDescriptorTableMissingCommaTest) { + // This test will check that an error is produced when there is a missing + // comma between parameters + const llvm::StringLiteral Source = R"cc( + DescriptorTable( + CBV(b0) + visibility = SHADER_VISIBILITY_ALL + ) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + + SmallVector Elements; + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Elements, + Signature, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidRootConstantParamsCommaTest) { + // This test will check that an error is produced when there is a missing + // comma between parameters + const llvm::StringLiteral Source = R"cc( + RootConstants( + num32BitConstants = 1 + b0 + ) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + + SmallVector Elements; + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Elements, + Signature, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidRootDescriptorParamsCommaTest) { + // This test will check that an error is produced when there is a missing + // comma between parameters + const llvm::StringLiteral Source = R"cc( + CBV( + b0 + flags = 0 + ) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, 
ModLoader); + + SmallVector Elements; + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Elements, + Signature, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidDescriptorClauseParamsCommaTest) { + // This test will check that an error is produced when there is a missing + // comma between parameters + const llvm::StringLiteral Source = R"cc( + DescriptorTable( + UAV( + u0 + flags = 0 + ) + ) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + + SmallVector Elements; + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Elements, + Signature, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidStaticSamplerCommaTest) { + // This test will check that an error is produced when there is a missing + // comma between parameters + const llvm::StringLiteral Source = R"cc( + StaticSampler( + s0 + maxLOD = 3 + ) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + + SmallVector Elements; + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Elements, + Signature, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + } // anonymous namespace diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp 
b/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp index b461d9109271c..023c02ddaa3e4 100644 --- a/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp +++ b/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp @@ -233,3 +233,34 @@ TEST(DependencyScanningFilesystem, DiagnoseCachedFileSizeChange) { ASSERT_EQ(SizeInfo->CachedSize, 0u); ASSERT_EQ(SizeInfo->ActualSize, 8u); } + +TEST(DependencyScanningFilesystem, DoNotDiagnoseDirSizeChange) { + llvm::SmallString<128> Dir; + ASSERT_FALSE(llvm::sys::fs::createUniqueDirectory("tmp", Dir)); + + llvm::IntrusiveRefCntPtr FS = + llvm::vfs::createPhysicalFileSystem(); + + DependencyScanningFilesystemSharedCache SharedCache; + DependencyScanningWorkerFilesystem DepFS(SharedCache, FS); + + // Trigger the file system cache. + ASSERT_EQ(DepFS.exists(Dir), true); + + // Add a file to the FS to change its size. + // It seems that directory sizes reported are not meaningful, + // and should not be used to check for size changes. + // This test is setup only to trigger a size change so that we + // know we are excluding directories from reporting. + llvm::SmallString<128> FilePath = Dir; + llvm::sys::path::append(FilePath, "file.h"); + { + std::error_code EC; + llvm::raw_fd_ostream TempFile(FilePath, EC); + ASSERT_FALSE(EC); + } + + // We do not report directory size changes. 
+ auto InvalidEntries = SharedCache.getOutOfDateEntries(*FS); + EXPECT_EQ(InvalidEntries.size(), 0u); +} diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 409f1c4f71834..d4fb56e6a39b7 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2417,7 +2417,11 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "#ifndef __ARM_NEON_H\n"; OS << "#define __ARM_NEON_H\n\n"; - OS << "#ifndef __ARM_FP\n"; + OS << "#if !defined(__arm__) && !defined(__aarch64__) && " + "!defined(__arm64ec__)\n"; + OS << "#error \" is intended only for ARM and AArch64 " + "targets\"\n"; + OS << "#elif !defined(__ARM_FP)\n"; OS << "#error \"NEON intrinsics not available with the soft-float ABI. " "Please use -mfloat-abi=softfp or -mfloat-abi=hard\"\n"; OS << "#else\n\n"; diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index fb2aee8e42ee2..9a0426ff29470 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -162,9 +162,7 @@ endmacro() # OBJECT_LIBS # PARENT_TARGET # ADDITIONAL_HEADERS
-# EXTENSIONS -# C_STANDARD -# CXX_STANDARD ) +# EXTENSIONS ) function(add_compiler_rt_runtime name type) if(NOT type MATCHES "^(OBJECT|STATIC|SHARED|MODULE)$") message(FATAL_ERROR @@ -173,7 +171,7 @@ function(add_compiler_rt_runtime name type) endif() cmake_parse_arguments(LIB "" - "PARENT_TARGET;C_STANDARD;CXX_STANDARD" + "PARENT_TARGET" "OS;ARCHS;SOURCES;CFLAGS;LINK_FLAGS;DEFS;DEPS;LINK_LIBS;OBJECT_LIBS;ADDITIONAL_HEADERS;EXTENSIONS" ${ARGN}) set(libnames) @@ -362,12 +360,6 @@ function(add_compiler_rt_runtime name type) set_target_link_flags(${libname} ${extra_link_flags_${libname}}) set_property(TARGET ${libname} APPEND PROPERTY COMPILE_DEFINITIONS ${LIB_DEFS}) - if(LIB_C_STANDARD) - set_property(TARGET ${libname} PROPERTY C_STANDARD ${LIB_C_STANDARD}) - endif() - if(LIB_CXX_STANDARD) - set_property(TARGET ${libname} PROPERTY CXX_STANDARD ${LIB_CXX_STANDARD}) - endif() set_target_output_directories(${libname} ${output_dir_${libname}}) install(TARGETS ${libname} ARCHIVE DESTINATION ${install_dir_${libname}} diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index b86bb1bca7cda..c62855835512d 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -26,7 +26,6 @@ builtin_check_c_compiler_flag("-Xclang -mcode-object-version=none" COMPILER_RT_H builtin_check_c_compiler_flag(-Wbuiltin-declaration-mismatch COMPILER_RT_HAS_WBUILTIN_DECLARATION_MISMATCH_FLAG) builtin_check_c_compiler_flag(/Zl COMPILER_RT_HAS_ZL_FLAG) builtin_check_c_compiler_flag(-fcf-protection=full COMPILER_RT_HAS_FCF_PROTECTION_FLAG) -builtin_check_c_compiler_flag(-nostdinc++ COMPILER_RT_HAS_NOSTDINCXX_FLAG) builtin_check_c_compiler_source(COMPILER_RT_HAS_ATOMIC_KEYWORD " diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 3ab92403d4168..5e832315f3666 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -6,7 
+6,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) cmake_minimum_required(VERSION 3.20.0) set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) - project(CompilerRTBuiltins C CXX ASM) + project(CompilerRTBuiltins C ASM) set(COMPILER_RT_STANDALONE_BUILD TRUE) set(COMPILER_RT_BUILTINS_STANDALONE_BUILD TRUE) @@ -64,8 +64,6 @@ include(CMakePushCheckState) option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) -include_directories(../../../third-party/siphash/include) - # TODO: Need to add a mechanism for logging errors when builtin source files are # added to a sub-directory and not this CMakeLists file. set(GENERIC_SOURCES @@ -591,7 +589,6 @@ set(aarch64_SOURCES ${GENERIC_TF_SOURCES} ${GENERIC_SOURCES} cpu_model/aarch64.c - aarch64/emupac.cpp aarch64/fp_mode.c ) @@ -839,7 +836,7 @@ else () append_list_if(COMPILER_RT_ENABLE_CET -fcf-protection=full BUILTIN_CFLAGS) endif() - append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_STD_C11_FLAG -std=c11 BUILTIN_CFLAGS) append_list_if(COMPILER_RT_HAS_WBUILTIN_DECLARATION_MISMATCH_FLAG -Werror=builtin-declaration-mismatch BUILTIN_CFLAGS) # Don't embed directives for picking any specific CRT @@ -961,8 +958,6 @@ else () SOURCES ${${arch}_SOURCES} DEFS ${BUILTIN_DEFS} CFLAGS ${BUILTIN_CFLAGS_${arch}} - C_STANDARD 11 - CXX_STANDARD 17 PARENT_TARGET builtins) cmake_pop_check_state() endif () diff --git a/compiler-rt/lib/builtins/aarch64/emupac.cpp b/compiler-rt/lib/builtins/aarch64/emupac.cpp deleted file mode 100644 index 4e28667718754..0000000000000 --- a/compiler-rt/lib/builtins/aarch64/emupac.cpp +++ /dev/null @@ -1,140 +0,0 @@ -//===--- emupac.cpp - Emulated PAC implementation -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements Emulated PAC using SipHash_1_3 as the IMPDEF hashing -// scheme. -// -//===----------------------------------------------------------------------===// - -#include - -#include "siphash/SipHash.h" - -// EmuPAC implements runtime emulation of PAC instructions. If the current -// CPU supports PAC, EmuPAC uses real PAC instructions. Otherwise, it uses the -// emulation, which is effectively an implementation of PAC with an IMPDEF -// hashing scheme based on SipHash_1_3. -// -// The purpose of the emulation is to allow programs to be built to be portable -// to machines without PAC support, with some performance loss and increased -// probability of false positives (due to not being able to portably determine -// the VA size), while being functionally almost equivalent to running on a -// machine with PAC support. One example of a use case is if PAC is used in -// production as a security mitigation, but the testing environment is -// heterogeneous (i.e. some machines lack PAC support). In this case we would -// like the testing machines to be able to detect issues resulting -// from the use of PAC instructions that would affect production by running -// tests. This can be achieved by building test binaries with EmuPAC and -// production binaries with real PAC. -// -// EmuPAC should not be used in production and is only intended for testing use -// cases. This is not only because of the performance costs, which will exist -// even on PAC-supporting machines because of the function call overhead for -// each sign/auth operation, but because it provides weaker security compared to -// real PAC: the key is constant and public, which means that we do not mix a -// global secret. -// -// The emulation assumes that the VA size is at most 48 bits. 
The architecture -// as of ARMv8.2, which was the last architecture version in which PAC was not -// mandatory, permitted VA size up to 52 bits via ARMv8.2-LVA, but we are -// unaware of an ARMv8.2 CPU that implemented ARMv8.2-LVA. - -const uint64_t max_va_size = 48; -const uint64_t pac_mask = ((1ULL << 55) - 1) & ~((1ULL << max_va_size) - 1); -const uint64_t ttbr1_mask = 1ULL << 55; - -// Determine whether PAC is supported without accessing memory. This utilizes -// the XPACLRI instruction which will copy bit 55 of x30 into at least bit 54 if -// PAC is supported and acts as a NOP if PAC is not supported. -static bool pac_supported() { - register uintptr_t x30 __asm__("x30") = 1ULL << 55; - __asm__ __volatile__("xpaclri" : "+r"(x30)); - return x30 & (1ULL << 54); -} - -// This asm snippet is used to force the creation of a frame record when -// calling the EmuPAC functions. This is important because the EmuPAC functions -// may crash if an auth failure is detected and may be unwound past using a -// frame pointer based unwinder. -#ifdef __GCC_HAVE_DWARF2_CFI_ASM -#define CFI_INST(inst) inst -#else -#define CFI_INST(inst) -#endif - -// clang-format off -#define FRAME_POINTER_WRAP(sym) \ - CFI_INST(".cfi_startproc\n") \ - "stp x29, x30, [sp, #-16]!\n" \ - CFI_INST(".cfi_def_cfa_offset 16\n") \ - "mov x29, sp\n" \ - CFI_INST(".cfi_def_cfa w29, 16\n") \ - CFI_INST(".cfi_offset w30, -8\n") \ - CFI_INST(".cfi_offset w29, -16\n") \ - "bl " #sym "\n" \ - CFI_INST(".cfi_def_cfa wsp, 16\n") \ - "ldp x29, x30, [sp], #16\n" \ - CFI_INST(".cfi_def_cfa_offset 0\n") \ - CFI_INST(".cfi_restore w30\n") \ - CFI_INST(".cfi_restore w29\n") \ - "ret\n" \ - CFI_INST(".cfi_endproc\n") -// clang-format on - -// Emulated DA key value. 
-static const uint8_t emu_da_key[16] = {0xb5, 0xd4, 0xc9, 0xeb, 0x79, 0x10, - 0x4a, 0x79, 0x6f, 0xec, 0x8b, 0x1b, - 0x42, 0x87, 0x81, 0xd4}; - -extern "C" [[gnu::flatten]] uint64_t -__emupac_pacda_impl(uint64_t ptr, uint64_t disc) { - if (pac_supported()) { - __asm__ __volatile__(".arch_extension pauth\npacda %0, %1" - : "+r"(ptr) - : "r"(disc)); - return ptr; - } - if (ptr & ttbr1_mask) { - if ((ptr & pac_mask) != pac_mask) { - return ptr | pac_mask; - } - } else { - if (ptr & pac_mask) { - return ptr & ~pac_mask; - } - } - uint64_t hash; - siphash<1, 3>(reinterpret_cast(&ptr), 8, emu_da_key, - *reinterpret_cast(&hash)); - return (ptr & ~pac_mask) | (hash & pac_mask); -} - -__asm__(".globl __emupac_pacda\n" - "__emupac_pacda:\n" FRAME_POINTER_WRAP(__emupac_pacda_impl)); - -extern "C" [[gnu::flatten]] uint64_t -__emupac_autda_impl(uint64_t ptr, uint64_t disc) { - if (pac_supported()) { - __asm__ __volatile__(".arch_extension pauth\nautda %0, %1" - : "+r"(ptr) - : "r"(disc)); - return ptr; - } - uint64_t ptr_without_pac = - (ptr & ttbr1_mask) ? 
(ptr | pac_mask) : (ptr & ~pac_mask); - uint64_t hash; - siphash<1, 3>(reinterpret_cast(&ptr_without_pac), 8, emu_da_key, - *reinterpret_cast(&hash)); - if (((ptr & ~pac_mask) | (hash & pac_mask)) != ptr) { - __builtin_trap(); - } - return ptr_without_pac; -} - -__asm__(".globl __emupac_autda\n" - "__emupac_autda:\n" FRAME_POINTER_WRAP(__emupac_autda_impl)); diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index 7c7f8cb64aa9a..48862f3642175 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -223,7 +223,7 @@ typedef union { #define CRT_HAS_TF_MODE #endif -#if __STDC_VERSION__ >= 199901L && !defined(_MSC_VER) +#if __STDC_VERSION__ >= 199901L typedef float _Complex Fcomplex; typedef double _Complex Dcomplex; typedef long double _Complex Lcomplex; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp index 4940062eeae47..4c1e005289230 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -87,10 +88,6 @@ #include #include -#define _KERNEL // to declare 'shminfo' structure -#include -#undef _KERNEL - #undef IOC_DIRMASK // Include these after system headers to avoid name clashes and ambiguities. 
@@ -141,8 +138,6 @@ unsigned struct_timeb_sz = sizeof(struct timeb); unsigned struct_msqid_ds_sz = sizeof(struct msqid_ds); unsigned struct_mq_attr_sz = sizeof(struct mq_attr); unsigned struct_statvfs_sz = sizeof(struct statvfs); -unsigned struct_shminfo_sz = sizeof(struct shminfo); -unsigned struct_shm_info_sz = sizeof(struct shm_info); unsigned struct_regmatch_sz = sizeof(regmatch_t); unsigned struct_regex_sz = sizeof(regex_t); unsigned struct_fstab_sz = sizeof(struct fstab); @@ -156,9 +151,6 @@ const uptr sig_err = (uptr)SIG_ERR; const uptr sa_siginfo = (uptr)SA_SIGINFO; int shmctl_ipc_stat = (int)IPC_STAT; -int shmctl_ipc_info = (int)IPC_INFO; -int shmctl_shm_info = (int)SHM_INFO; -int shmctl_shm_stat = (int)SHM_STAT; unsigned struct_utmpx_sz = sizeof(struct utmpx); int map_fixed = MAP_FIXED; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h index 8ce73f206fd88..382b67ce78ebd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h @@ -419,12 +419,14 @@ struct __sanitizer_wordexp_t { typedef void __sanitizer_FILE; -extern unsigned struct_shminfo_sz; -extern unsigned struct_shm_info_sz; extern int shmctl_ipc_stat; -extern int shmctl_ipc_info; -extern int shmctl_shm_info; -extern int shmctl_shm_stat; + +// This simplifies generic code +#define struct_shminfo_sz -1 +#define struct_shm_info_sz -1 +#define shmctl_shm_stat -1 +#define shmctl_ipc_info -1 +#define shmctl_shm_info -1 extern unsigned struct_utmpx_sz; diff --git a/compiler-rt/test/builtins/Unit/aarch64/emupac.c b/compiler-rt/test/builtins/Unit/aarch64/emupac.c deleted file mode 100644 index 60ad9444801d8..0000000000000 --- a/compiler-rt/test/builtins/Unit/aarch64/emupac.c +++ /dev/null @@ -1,62 +0,0 @@ -// REQUIRES: librt_has_emupac -// RUN: %clang_builtins %s %librt -o %t -// RUN: %run %t 1 -// RUN: %run 
%t 2 -// RUN: %expect_crash %run %t 3 -// RUN: %expect_crash %run %t 4 - -#include -#include -#include - -uint64_t __emupac_pacda(uint64_t ptr, uint64_t disc); -uint64_t __emupac_autda(uint64_t ptr, uint64_t disc); - -int main(int argc, char **argv) { - char stack_object1; - uint64_t ptr1 = (uint64_t)&stack_object1; - - char stack_object2; - uint64_t ptr2 = (uint64_t)&stack_object2; - - switch (atoi(argv[1])) { - case 1: { - // Normal case: test that a pointer authenticated with the same - // discriminator is equal to the original pointer. - uint64_t signed_ptr = __emupac_pacda(ptr1, ptr2); - uint64_t authed_ptr = __emupac_autda(signed_ptr, ptr2); - if (authed_ptr != ptr1) { - printf("0x%lx != 0x%lx\n", authed_ptr, ptr1); - return 1; - } - break; - } - case 2: { - // Test that negative addresses (addresses controlled by TTBR1, - // conventionally kernel addresses) can be signed and authenticated. - uint64_t unsigned_ptr = -1ULL; - uint64_t signed_ptr = __emupac_pacda(unsigned_ptr, ptr2); - uint64_t authed_ptr = __emupac_autda(signed_ptr, ptr2); - if (authed_ptr != unsigned_ptr) { - printf("0x%lx != 0x%lx\n", authed_ptr, unsigned_ptr); - return 1; - } - break; - } - case 3: { - // Test that a corrupted signature crashes the program. - uint64_t signed_ptr = __emupac_pacda(ptr1, ptr2); - __emupac_autda(signed_ptr + (1ULL << 48), ptr2); - break; - } - case 4: { - // Test that signing a pointer with signature bits already set produces a pointer - // that would fail auth. - uint64_t signed_ptr = __emupac_pacda(ptr1 + (1ULL << 48), ptr2); - __emupac_autda(signed_ptr, ptr2); - break; - } - } - - return 0; -} diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index b2dbbcb5630f4..dfaa2eb6e03fe 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -25,6 +25,8 @@ page](https://llvm.org/releases/). ## Major New Features * Initial support for VOLATILE variables and procedure interface arguments has been added. 
+* OpenMP support is stable and no longer considered experimental. All of OpenMP 3.1 is + supported, along with much of OpenMP 4.0 and some parts of later standards. ## Bug Fixes diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index a36b8719e365d..e9aeed18ab0b7 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -472,7 +472,6 @@ struct NodeVisitor { READ_FEATURE(OmpIteration) READ_FEATURE(OmpIterationOffset) READ_FEATURE(OmpIterationVector) - READ_FEATURE(OmpEndAllocators) READ_FEATURE(OmpEndBlockDirective) READ_FEATURE(OmpEndCriticalDirective) READ_FEATURE(OmpEndLoopDirective) diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h similarity index 99% rename from flang/lib/Lower/OpenMP/Clauses.h rename to flang/include/flang/Lower/OpenMP/Clauses.h index d7ab21d428e32..7f317f05f67b7 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/include/flang/Lower/OpenMP/Clauses.h @@ -179,6 +179,7 @@ using IteratorSpecifier = tomp::type::IteratorSpecifierT; using DefinedOperator = tomp::type::DefinedOperatorT; using ProcedureDesignator = tomp::type::ProcedureDesignatorT; using ReductionOperator = tomp::type::ReductionIdentifierT; +using ReductionOperatorList = List; using DependenceType = tomp::type::DependenceType; using Prescriptiveness = tomp::type::Prescriptiveness; diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h similarity index 85% rename from flang/lib/Lower/OpenMP/ReductionProcessor.h rename to flang/include/flang/Lower/Support/ReductionProcessor.h index a7198b48f6b4e..72d8a0096f511 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.h +++ b/flang/include/flang/Lower/Support/ReductionProcessor.h @@ -13,13 +13,12 @@ #ifndef FORTRAN_LOWER_REDUCTIONPROCESSOR_H #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H -#include "Clauses.h" +#include 
"flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/type.h" -#include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Location.h" #include "mlir/IR/Types.h" @@ -65,6 +64,9 @@ class ReductionProcessor { static ReductionIdentifier getReductionType(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp); + static ReductionIdentifier + getReductionType(const fir::ReduceOperationEnum &pd); + static bool supportedIntrinsicProcReduction(const omp::clause::ProcedureDesignator &pd); @@ -78,10 +80,9 @@ class ReductionProcessor { const fir::KindMapping &kindMap, mlir::Type ty, bool isByRef); - static std::string - getReductionName(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp, - const fir::KindMapping &kindMap, mlir::Type ty, - bool isByRef); + static std::string getReductionName(ReductionIdentifier redId, + const fir::KindMapping &kindMap, + mlir::Type ty, bool isByRef); /// This function returns the identity value of the operator \p /// reductionOpName. For example: @@ -113,22 +114,23 @@ class ReductionProcessor { /// symbol table. The declaration has a constant initializer with the neutral /// value `initValue`, and the reduction combiner carried over from `reduce`. /// TODO: add atomic region. - static mlir::omp::DeclareReductionOp - createDeclareReduction(AbstractConverter &builder, - llvm::StringRef reductionOpName, - const ReductionIdentifier redId, mlir::Type type, - mlir::Location loc, bool isByRef); + template + static OpType createDeclareReduction(AbstractConverter &builder, + llvm::StringRef reductionOpName, + const ReductionIdentifier redId, + mlir::Type type, mlir::Location loc, + bool isByRef); /// Creates a reduction declaration and associates it with an OpenMP block /// directive. 
- template + template static void processReductionArguments( mlir::Location currentLocation, lower::AbstractConverter &converter, - const T &reduction, llvm::SmallVectorImpl &reductionVars, + const RedOperatorListTy &redOperatorList, + llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod = nullptr); + const llvm::SmallVectorImpl &reductionSymbols); }; template diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 2845080030b92..7bd96ac3ea631 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -112,7 +112,7 @@ def fir_ReduceOperationEnum : I32BitEnumAttr<"ReduceOperationEnum", I32BitEnumAttrCaseBit<"MIN", 7, "min">, I32BitEnumAttrCaseBit<"IAND", 8, "iand">, I32BitEnumAttrCaseBit<"IOR", 9, "ior">, - I32BitEnumAttrCaseBit<"EIOR", 10, "eior"> + I32BitEnumAttrCaseBit<"IEOR", 10, "ieor"> ]> { let separator = ", "; let cppNamespace = "::fir"; diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index f440580f0878a..e3f5c4403002a 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3518,7 +3518,7 @@ def fir_BoxTotalElementsOp def YieldOp : fir_Op<"yield", [Pure, ReturnLike, Terminator, - ParentOneOf<["LocalitySpecifierOp"]>]> { + ParentOneOf<["LocalitySpecifierOp", "DeclareReductionOp"]>]> { let summary = "loop yield and termination operation"; let description = [{ "fir.yield" yields SSA values from a fir dialect op region and @@ -3656,6 +3656,103 @@ def fir_LocalitySpecifierOp : fir_Op<"local", [IsolatedFromAbove]> { let hasRegionVerifier = 1; } +def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, + Symbol]> { + let summary = "declares a 
reduction kind"; + let description = [{ + Note: this operation is adapted from omp::DeclareReductionOp. There is a lot + duplication at the moment. TODO Combine both ops into one. See: + https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108. + + Declares a `do concurrent` reduction. This requires two mandatory and three + optional regions. + + 1. The optional alloc region specifies how to allocate the thread-local + reduction value. This region should not contain control flow and all + IR should be suitable for inlining straight into an entry block. In + the common case this is expected to contain only allocas. It is + expected to `fir.yield` the allocated value on all control paths. + If allocation is conditional (e.g. only allocate if the mold is + allocated), this should be done in the initilizer region and this + region not included. The alloc region is not used for by-value + reductions (where allocation is implicit). + 2. The initializer region specifies how to initialize the thread-local + reduction value. This is usually the neutral element of the reduction. + For convenience, the region has an argument that contains the value + of the reduction accumulator at the start of the reduction. If an alloc + region is specified, there is a second block argument containing the + address of the allocated memory. The initializer region is expected to + `fir.yield` the new value on all control flow paths. + 3. The reduction region specifies how to combine two values into one, i.e. + the reduction operator. It accepts the two values as arguments and is + expected to `fir.yield` the combined value on all control flow paths. + 4. The atomic reduction region is optional and specifies how two values + can be combined atomically given local accumulator variables. It is + expected to store the combined value in the first accumulator variable. + 5. 
The cleanup region is optional and specifies how to clean up any memory + allocated by the initializer region. The region has an argument that + contains the value of the thread-local reduction accumulator. This will + be executed after the reduction has completed. + + Note that the MLIR type system does not allow for type-polymorphic + reductions. Separate reduction declarations should be created for different + element and accumulator types. + + For initializer and reduction regions, the operand to `fir.yield` must + match the parent operation's results. + }]; + + let arguments = (ins SymbolNameAttr:$sym_name, + TypeAttr:$type); + + let regions = (region MaxSizedRegion<1>:$allocRegion, + AnyRegion:$initializerRegion, + AnyRegion:$reductionRegion, + AnyRegion:$atomicReductionRegion, + AnyRegion:$cleanupRegion); + + let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " + "( `alloc` $allocRegion^ )? " + "`init` $initializerRegion " + "`combiner` $reductionRegion " + "( `atomic` $atomicReductionRegion^ )? " + "( `cleanup` $cleanupRegion^ )? "; + + let extraClassDeclaration = [{ + mlir::BlockArgument getAllocMoldArg() { + auto ®ion = getAllocRegion(); + return region.empty() ? nullptr : region.getArgument(0); + } + mlir::BlockArgument getInitializerMoldArg() { + return getInitializerRegion().getArgument(0); + } + mlir::BlockArgument getInitializerAllocArg() { + return getAllocRegion().empty() ? + nullptr : getInitializerRegion().getArgument(1); + } + mlir::BlockArgument getReductionLhsArg() { + return getReductionRegion().getArgument(0); + } + mlir::BlockArgument getReductionRhsArg() { + return getReductionRegion().getArgument(1); + } + mlir::BlockArgument getAtomicReductionLhsArg() { + auto ®ion = getAtomicReductionRegion(); + return region.empty() ? nullptr : region.getArgument(0); + } + mlir::BlockArgument getAtomicReductionRhsArg() { + auto ®ion = getAtomicReductionRegion(); + return region.empty() ? 
nullptr : region.getArgument(1); + } + mlir::BlockArgument getCleanupAllocArg() { + auto ®ion = getCleanupRegion(); + return region.empty() ? nullptr : region.getArgument(0); + } + }]; + + let hasRegionVerifier = 1; +} + def fir_DoConcurrentOp : fir_Op<"do_concurrent", [SingleBlock, AutomaticAllocationScope]> { let summary = "do concurrent loop wrapper"; @@ -3694,6 +3791,25 @@ def fir_LocalSpecifier { ); } +def fir_ReduceSpecifier { + dag arguments = (ins + Variadic:$reduce_vars, + OptionalAttr:$reduce_byref, + + // This introduces redundency in how reductions are modelled. In particular, + // a single reduction is represented by 2 attributes: + // + // 1. `$reduce_syms` which is a list of `DeclareReductionOp`s. + // 2. `$reduce_attrs` which is an array of `fir::ReduceAttr` values. + // + // The first makes it easier to map `do concurrent` to parallization models + // (e.g. OpenMP and OpenACC) while the second makes it easier to map it to + // nests of `fir.do_loop ... unodered` ops. + OptionalAttr:$reduce_syms, + OptionalAttr:$reduce_attrs + ); +} + def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", [AttrSizedOperandSegments, DeclareOpInterfaceMethods, @@ -3703,7 +3819,7 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", let description = [{ An operation that models a Fortran `do concurrent` loop's header and block. This is a single-region single-block terminator op that is expected to - terminate the region of a `omp.do_concurrent` wrapper op. + terminate the region of a `fir.do_concurrent` wrapper op. This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to `scf.parallel`, a loop nest takes 3 groups of SSA values as operands that @@ -3741,8 +3857,6 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", - `lowerBound`: The group of SSA values for the nest's lower bounds. - `upperBound`: The group of SSA values for the nest's upper bounds. - `step`: The group of SSA values for the nest's steps. 
- - `reduceOperands`: The reduction SSA values, if any. - - `reduceAttrs`: Attributes to store reduction operations, if any. - `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to LLVM. }]; @@ -3751,12 +3865,12 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", Variadic:$lowerBound, Variadic:$upperBound, Variadic:$step, - Variadic:$reduceOperands, - OptionalAttr:$reduceAttrs, OptionalAttr:$loopAnnotation ); - let arguments = !con(opArgs, fir_LocalSpecifier.arguments); + let arguments = !con(opArgs, + fir_LocalSpecifier.arguments, + fir_ReduceSpecifier.arguments); let regions = (region SizedRegion<1>:$region); @@ -3764,9 +3878,17 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", let hasVerifier = 1; let extraClassDeclaration = [{ - unsigned getNumInductionVars() { return getLowerBound().size(); } + unsigned getNumInductionVars() { + return getLowerBound().size(); + } - unsigned getNumLocalOperands() { return getLocalVars().size(); } + unsigned getNumLocalOperands() { + return getLocalVars().size(); + } + + unsigned getNumReduceOperands() { + return getReduceVars().size(); + } mlir::Block::BlockArgListType getInductionVars() { return getBody()->getArguments().slice(0, getNumInductionVars()); @@ -3777,19 +3899,15 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", getNumLocalOperands()); } + mlir::Block::BlockArgListType getRegionReduceArgs() { + return getBody()->getArguments().slice(getNumInductionVars() + + getNumLocalOperands(), + getNumReduceOperands()); + } + /// Number of operands controlling the loop unsigned getNumControlOperands() { return getLowerBound().size() * 3; } - // Get Number of reduction operands - unsigned getNumReduceOperands() { - return getReduceOperands().size(); - } - - mlir::Operation::operand_range getLocalOperands() { - return getOperands() - .slice(getNumControlOperands() + getNumReduceOperands(), - getNumLocalOperands()); - } }]; } diff --git 
a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index e3eed6aed8079..32b6ca45609b6 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -578,7 +578,6 @@ class ParseTreeDumper { NODE(parser, OmpDetachClause) NODE(parser, OmpDoacrossClause) NODE(parser, OmpDestroyClause) - NODE(parser, OmpEndAllocators) NODE(parser, OmpEndBlockDirective) NODE(parser, OmpEndCriticalDirective) NODE(parser, OmpEndLoopDirective) @@ -711,8 +710,6 @@ class ParseTreeDumper { NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPUtilityConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OmpDispatchDirective) - NODE(parser, OmpEndDispatchDirective) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPLoopConstruct) NODE(parser, OpenMPExecutableAllocate) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 43954ff735361..ab2dde7d5dfbe 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4598,8 +4598,11 @@ struct OmpClauseList { struct OmpDirectiveSpecification { ENUM_CLASS(Flags, None, DeprecatedSyntax); TUPLE_CLASS_BOILERPLATE(OmpDirectiveSpecification); + const OmpDirectiveName &DirName() const { + return std::get(t); + } llvm::omp::Directive DirId() const { // - return std::get(t).v; + return DirName().v; } const OmpArgumentList &Arguments() const; const OmpClauseList &Clauses() const; @@ -4839,17 +4842,17 @@ struct OpenMPExecutableAllocate { t; }; -EMPTY_CLASS(OmpEndAllocators); - -// 6.7 Allocators construct [OpenMP 5.2] -// allocators-construct -> ALLOCATORS [allocate-clause [,]] -// allocate-stmt -// [omp-end-allocators-construct] +// Ref: [5.2:180-181], [6.0:315] +// +// allocators-construct -> +// ALLOCATORS [allocate-clause...] 
+// block +// [END ALLOCATORS] struct OpenMPAllocatorsConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPAllocatorsConstruct); CharBlock source; - std::tuple, - std::optional> + std::tuple> t; }; @@ -4936,19 +4939,11 @@ struct OpenMPDepobjConstruct { // nocontext-clause | // novariants-clause | // nowait-clause -struct OmpDispatchDirective { - TUPLE_CLASS_BOILERPLATE(OmpDispatchDirective); - CharBlock source; - std::tuple t; -}; - -EMPTY_CLASS(OmpEndDispatchDirective); - struct OpenMPDispatchConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPDispatchConstruct); CharBlock source; - std::tuple> + std::tuple> t; }; diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 3d9f06308d8c1..21e6b3c3dd50d 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1157,23 +1157,28 @@ template semantics::UnorderedSymbolSet CollectCudaSymbols( bool HasCUDAImplicitTransfer(const Expr &expr) { semantics::UnorderedSymbolSet hostSymbols; semantics::UnorderedSymbolSet deviceSymbols; + semantics::UnorderedSymbolSet cudaSymbols{CollectCudaSymbols(expr)}; SymbolVector symbols{GetSymbolVector(expr)}; std::reverse(symbols.begin(), symbols.end()); bool skipNext{false}; for (const Symbol &sym : symbols) { - bool isComponent{sym.owner().IsDerivedType()}; - bool skipComponent{false}; - if (!skipNext) { - if (IsCUDADeviceSymbol(sym)) { - deviceSymbols.insert(sym); - } else if (isComponent) { - skipComponent = true; // Component is not device. Look on the base. - } else { - hostSymbols.insert(sym); + if (cudaSymbols.find(sym) != cudaSymbols.end()) { + bool isComponent{sym.owner().IsDerivedType()}; + bool skipComponent{false}; + if (!skipNext) { + if (IsCUDADeviceSymbol(sym)) { + deviceSymbols.insert(sym); + } else if (isComponent) { + skipComponent = true; // Component is not device. Look on the base. 
+ } else { + hostSymbols.insert(sym); + } } + skipNext = isComponent && !skipComponent; + } else { + skipNext = false; } - skipNext = isComponent && !skipComponent; } bool hasConstant{HasConstant(expr)}; return (hasConstant || (hostSymbols.size() > 0)) && deviceSymbols.size() > 0; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 987fd3095fdf6..33c1f1e7a3c3a 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -31,6 +31,7 @@ #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/Runtime.h" #include "flang/Lower/StatementContext.h" +#include "flang/Lower/Support/ReductionProcessor.h" #include "flang/Lower/Support/Utils.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/CUFCommon.h" @@ -127,9 +128,8 @@ struct IncrementLoopInfo { bool isConcurrent; llvm::SmallVector localSymList; llvm::SmallVector localInitSymList; - llvm::SmallVector< - std::pair> - reduceSymList; + llvm::SmallVector reduceSymList; + llvm::SmallVector reduceOperatorList; llvm::SmallVector sharedSymList; mlir::Value loopVariable = nullptr; @@ -1993,7 +1993,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { case Fortran::parser::ReductionOperator::Operator::Ior: return fir::ReduceOperationEnum::IOR; case Fortran::parser::ReductionOperator::Operator::Ieor: - return fir::ReduceOperationEnum::EIOR; + return fir::ReduceOperationEnum::IEOR; } llvm_unreachable("illegal reduction operator"); } @@ -2027,8 +2027,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { std::get(reduceList->t)); for (const Fortran::parser::Name &x : std::get>(reduceList->t)) { - info.reduceSymList.push_back( - std::make_pair(reduce_operation, x.symbol)); + info.reduceSymList.push_back(x.symbol); + info.reduceOperatorList.push_back(reduce_operation); } } } @@ -2089,6 +2089,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { assign.u = Fortran::evaluate::Assignment::BoundsSpec{}; genAssignment(assign); } 
+ for (const Fortran::semantics::Symbol *sym : info.sharedSymList) { const auto *hostDetails = sym->detailsIf(); @@ -2112,6 +2113,45 @@ class FirConverter : public Fortran::lower::AbstractConverter { } } + llvm::SmallVector reduceVarByRef; + llvm::SmallVector reductionDeclSymbols; + llvm::SmallVector nestReduceAttrs; + + for (const auto &reduceOp : info.reduceOperatorList) + nestReduceAttrs.push_back( + fir::ReduceAttr::get(builder->getContext(), reduceOp)); + + llvm::SmallVector reduceVars; + Fortran::lower::omp::ReductionProcessor rp; + rp.processReductionArguments( + toLocation(), *this, info.reduceOperatorList, reduceVars, + reduceVarByRef, reductionDeclSymbols, info.reduceSymList); + + doConcurrentLoopOp.getReduceVarsMutable().assign(reduceVars); + doConcurrentLoopOp.setReduceSymsAttr( + reductionDeclSymbols.empty() + ? nullptr + : mlir::ArrayAttr::get(builder->getContext(), + reductionDeclSymbols)); + doConcurrentLoopOp.setReduceAttrsAttr( + nestReduceAttrs.empty() + ? nullptr + : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs)); + doConcurrentLoopOp.setReduceByrefAttr( + reduceVarByRef.empty() ? nullptr + : mlir::DenseBoolArrayAttr::get( + builder->getContext(), reduceVarByRef)); + + for (auto [sym, reduceVar] : + llvm::zip_equal(info.reduceSymList, reduceVars)) { + auto arg = doConcurrentLoopOp.getRegion().begin()->addArgument( + reduceVar.getType(), doConcurrentLoopOp.getLoc()); + bindSymbol(*sym, hlfir::translateToExtendedValue( + reduceVar.getLoc(), *builder, hlfir::Entity{arg}, + /*contiguousHint=*/true) + .first); + } + // Note that allocatable, types with ultimate components, and type // requiring finalization are forbidden in LOCAL/LOCAL_INIT (F2023 C1130), // so no clean-up needs to be generated for these entities. @@ -2203,6 +2243,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { } } + // Introduce a `do concurrent` scope to bind symbols corresponding to local, + // local_init, and reduce region arguments. 
+ if (!incrementLoopNestInfo.empty() && + incrementLoopNestInfo.back().isConcurrent) + localSymbols.pushScope(); + // Increment loop begin code. (Infinite/while code was already generated.) if (!infiniteLoop && !whileCondition) genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs); @@ -2226,6 +2272,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { // This call may generate a branch in some contexts. genFIR(endDoEval, unstructuredContext); + + if (!incrementLoopNestInfo.empty() && + incrementLoopNestInfo.back().isConcurrent) + localSymbols.popScope(); } /// Generate FIR to evaluate loop control values (lower, upper and step). @@ -2408,19 +2458,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { info.stepVariable = builder->createTemporary(loc, stepValue.getType()); builder->create(loc, stepValue, info.stepVariable); } - - if (genDoConcurrent && nestReduceOperands.empty()) { - // Create DO CONCURRENT reduce operands and attributes - for (const auto &reduceSym : info.reduceSymList) { - const fir::ReduceOperationEnum reduceOperation = reduceSym.first; - const Fortran::semantics::Symbol *sym = reduceSym.second; - fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr); - nestReduceOperands.push_back(fir::getBase(exv)); - auto reduceAttr = - fir::ReduceAttr::get(builder->getContext(), reduceOperation); - nestReduceAttrs.push_back(reduceAttr); - } - } } for (auto [info, lowerValue, upperValue, stepValue] : @@ -2518,11 +2555,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->setInsertionPointToEnd(loopWrapperOp.getBody()); auto loopOp = builder->create( - loc, nestLBs, nestUBs, nestSts, nestReduceOperands, - nestReduceAttrs.empty() - ? 
nullptr - : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs), - nullptr, /*local_vars=*/std::nullopt, /*local_syms=*/nullptr); + loc, nestLBs, nestUBs, nestSts, /*loopAnnotation=*/nullptr, + /*local_vars=*/std::nullopt, + /*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt, + /*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr, + /*reduce_attrs=*/nullptr); llvm::SmallVector loopBlockArgTypes( incrementLoopNestInfo.size(), builder->getIndexType()); @@ -4842,8 +4879,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { .detailsIf()) { if (details->cudaDataAttr() && *details->cudaDataAttr() != Fortran::common::CUDADataAttr::Pinned) { - // TODO: This should probably being checked in semantic and give a - // proper error. assert( nbDeviceResidentObject <= 1 && "Only one reference to the device resident object is supported"); diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index 8049cdf333173..cd80aaf553869 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -29,11 +29,11 @@ add_flang_library(FortranLower OpenMP/DataSharingProcessor.cpp OpenMP/Decomposer.cpp OpenMP/OpenMP.cpp - OpenMP/ReductionProcessor.cpp OpenMP/Utils.cpp PFTBuilder.cpp Runtime.cpp Support/PrivateReductionUtils.cpp + Support/ReductionProcessor.cpp Support/Utils.cpp SymbolMap.cpp VectorSubscripts.cpp diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 42842bcb41a74..00c9cbf0d2a8f 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -164,14 +164,13 @@ createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, op.setStructured(structured); op.setImplicit(implicit); op.setDataClause(dataClause); - if (auto mappableTy = - mlir::dyn_cast(baseAddr.getType())) { - op.setVarType(baseAddr.getType()); + if (auto pointerLikeTy = + mlir::dyn_cast(baseAddr.getType())) { + op.setVarType(pointerLikeTy.getElementType()); } else { - assert(mlir::isa(baseAddr.getType()) && - 
"expected pointer-like"); - op.setVarType(mlir::cast(baseAddr.getType()) - .getElementType()); + assert(mlir::isa(baseAddr.getType()) && + "expected mappable"); + op.setVarType(baseAddr.getType()); } op->setAttr(Op::getOperandSegmentSizeAttr(), diff --git a/flang/lib/Lower/OpenMP/Atomic.cpp b/flang/lib/Lower/OpenMP/Atomic.cpp index 2ab91b239a3cc..6ea331c370640 100644 --- a/flang/lib/Lower/OpenMP/Atomic.cpp +++ b/flang/lib/Lower/OpenMP/Atomic.cpp @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// #include "Atomic.h" -#include "Clauses.h" #include "flang/Evaluate/expression.h" #include "flang/Evaluate/fold.h" #include "flang/Evaluate/tools.h" #include "flang/Evaluate/traverse.h" #include "flang/Evaluate/type.h" #include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/StatementContext.h" #include "flang/Lower/SymbolMap.h" diff --git a/flang/lib/Lower/OpenMP/ClauseFinder.h b/flang/lib/Lower/OpenMP/ClauseFinder.h index 3b77f2ca1d4cb..af52585452833 100644 --- a/flang/lib/Lower/OpenMP/ClauseFinder.h +++ b/flang/lib/Lower/OpenMP/ClauseFinder.h @@ -12,7 +12,7 @@ #ifndef FORTRAN_LOWER_CLAUSEFINDER_H #define FORTRAN_LOWER_CLAUSEFINDER_H -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" namespace Fortran { namespace lower { diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 7bea427099a28..74087d42a8e6e 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -11,11 +11,12 @@ //===----------------------------------------------------------------------===// #include "ClauseProcessor.h" -#include "Clauses.h" #include "Utils.h" #include "flang/Lower/ConvertExprToHLFIR.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/PFTBuilder.h" +#include "flang/Lower/Support/ReductionProcessor.h" #include "flang/Parser/tools.h" 
#include "flang/Semantics/tools.h" #include "llvm/Frontend/OpenMP/OMP.h.inc" @@ -25,6 +26,21 @@ namespace Fortran { namespace lower { namespace omp { +using ReductionModifier = + Fortran::lower::omp::clause::Reduction::ReductionModifier; + +mlir::omp::ReductionModifier translateReductionModifier(ReductionModifier mod) { + switch (mod) { + case ReductionModifier::Default: + return mlir::omp::ReductionModifier::defaultmod; + case ReductionModifier::Inscan: + return mlir::omp::ReductionModifier::inscan; + case ReductionModifier::Task: + return mlir::omp::ReductionModifier::task; + } + return mlir::omp::ReductionModifier::defaultmod; +} + /// Check for unsupported map operand types. static void checkMapType(mlir::Location location, mlir::Type type) { if (auto refType = mlir::dyn_cast(type)) @@ -1076,6 +1092,18 @@ bool ClauseProcessor::processIf( }); return found; } + +template +void collectReductionSyms( + const T &reduction, + llvm::SmallVectorImpl &reductionSyms) { + const auto &objectList{std::get(reduction.t)}; + for (const Object &object : objectList) { + const semantics::Symbol *symbol = object.sym(); + reductionSyms.push_back(symbol); + } +} + bool ClauseProcessor::processInReduction( mlir::Location currentLocation, mlir::omp::InReductionClauseOps &result, llvm::SmallVectorImpl &outReductionSyms) const { @@ -1085,10 +1113,14 @@ bool ClauseProcessor::processInReduction( llvm::SmallVector inReduceVarByRef; llvm::SmallVector inReductionDeclSymbols; llvm::SmallVector inReductionSyms; + collectReductionSyms(clause, inReductionSyms); + ReductionProcessor rp; - rp.processReductionArguments( - currentLocation, converter, clause, inReductionVars, - inReduceVarByRef, inReductionDeclSymbols, inReductionSyms); + rp.processReductionArguments( + currentLocation, converter, + std::get(clause.t), + inReductionVars, inReduceVarByRef, inReductionDeclSymbols, + inReductionSyms); // Copy local lists into the output. 
llvm::copy(inReductionVars, std::back_inserter(result.inReductionVars)); @@ -1416,10 +1448,23 @@ bool ClauseProcessor::processReduction( llvm::SmallVector reduceVarByRef; llvm::SmallVector reductionDeclSymbols; llvm::SmallVector reductionSyms; + collectReductionSyms(clause, reductionSyms); + + auto mod = std::get>(clause.t); + if (mod.has_value()) { + if (mod.value() == ReductionModifier::Task) + TODO(currentLocation, "Reduction modifier `task` is not supported"); + else + result.reductionMod = mlir::omp::ReductionModifierAttr::get( + converter.getFirOpBuilder().getContext(), + translateReductionModifier(mod.value())); + } + ReductionProcessor rp; - rp.processReductionArguments( - currentLocation, converter, clause, reductionVars, reduceVarByRef, - reductionDeclSymbols, reductionSyms, &result.reductionMod); + rp.processReductionArguments( + currentLocation, converter, + std::get(clause.t), + reductionVars, reduceVarByRef, reductionDeclSymbols, reductionSyms); // Copy local lists into the output. 
llvm::copy(reductionVars, std::back_inserter(result.reductionVars)); llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref)); @@ -1435,21 +1480,25 @@ bool ClauseProcessor::processTaskReduction( return findRepeatableClause( [&](const omp::clause::TaskReduction &clause, const parser::CharBlock &) { llvm::SmallVector taskReductionVars; - llvm::SmallVector TaskReduceVarByRef; - llvm::SmallVector TaskReductionDeclSymbols; - llvm::SmallVector TaskReductionSyms; + llvm::SmallVector taskReduceVarByRef; + llvm::SmallVector taskReductionDeclSymbols; + llvm::SmallVector taskReductionSyms; + collectReductionSyms(clause, taskReductionSyms); + ReductionProcessor rp; - rp.processReductionArguments( - currentLocation, converter, clause, taskReductionVars, - TaskReduceVarByRef, TaskReductionDeclSymbols, TaskReductionSyms); + rp.processReductionArguments( + currentLocation, converter, + std::get(clause.t), + taskReductionVars, taskReduceVarByRef, taskReductionDeclSymbols, + taskReductionSyms); // Copy local lists into the output. 
llvm::copy(taskReductionVars, std::back_inserter(result.taskReductionVars)); - llvm::copy(TaskReduceVarByRef, + llvm::copy(taskReduceVarByRef, std::back_inserter(result.taskReductionByref)); - llvm::copy(TaskReductionDeclSymbols, + llvm::copy(taskReductionDeclSymbols, std::back_inserter(result.taskReductionSyms)); - llvm::copy(TaskReductionSyms, std::back_inserter(outReductionSyms)); + llvm::copy(taskReductionSyms, std::back_inserter(outReductionSyms)); }); } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 3d8c4a337a4a4..f8a1f7983b79b 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -13,12 +13,11 @@ #define FORTRAN_LOWER_CLAUSEPROCESSOR_H #include "ClauseFinder.h" -#include "Clauses.h" -#include "ReductionProcessor.h" #include "Utils.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/DirectivesCommon.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Parser/dump-parse-tree.h" #include "flang/Parser/parse-tree.h" diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index b599d69a36272..22a07219d3a50 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/expression.h" diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index fded04c839fb4..ee2fc70d2e673 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -12,9 +12,9 @@ #ifndef FORTRAN_LOWER_DATASHARINGPROCESSOR_H #define FORTRAN_LOWER_DATASHARINGPROCESSOR_H -#include "Clauses.h" #include 
"flang/Lower/AbstractConverter.h" #include "flang/Lower/OpenMP.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/symbol.h" diff --git a/flang/lib/Lower/OpenMP/Decomposer.cpp b/flang/lib/Lower/OpenMP/Decomposer.cpp index 251cba9204adc..9bfbf67bec88c 100644 --- a/flang/lib/Lower/OpenMP/Decomposer.cpp +++ b/flang/lib/Lower/OpenMP/Decomposer.cpp @@ -12,8 +12,8 @@ #include "Decomposer.h" -#include "Clauses.h" #include "Utils.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Semantics/semantics.h" #include "flang/Tools/CrossToolHelpers.h" diff --git a/flang/lib/Lower/OpenMP/Decomposer.h b/flang/lib/Lower/OpenMP/Decomposer.h index e3291b7c59e21..65492bd76280d 100644 --- a/flang/lib/Lower/OpenMP/Decomposer.h +++ b/flang/lib/Lower/OpenMP/Decomposer.h @@ -8,7 +8,7 @@ #ifndef FORTRAN_LOWER_OPENMP_DECOMPOSER_H #define FORTRAN_LOWER_OPENMP_DECOMPOSER_H -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "mlir/IR/BuiltinOps.h" #include "llvm/Frontend/OpenMP/ConstructDecompositionT.h" #include "llvm/Frontend/OpenMP/OMP.h" diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0a56e888ac44b..4458f62eea95a 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -14,16 +14,15 @@ #include "Atomic.h" #include "ClauseProcessor.h" -#include "Clauses.h" #include "DataSharingProcessor.h" #include "Decomposer.h" -#include "ReductionProcessor.h" #include "Utils.h" #include "flang/Common/idioms.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertVariable.h" #include "flang/Lower/DirectivesCommon.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/StatementContext.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/BoxValue.h" diff --git a/flang/lib/Lower/OpenMP/Utils.cpp 
b/flang/lib/Lower/OpenMP/Utils.cpp index c226c2558e7aa..2e53f01f1da6a 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -12,9 +12,8 @@ #include "Utils.h" -#include "Clauses.h" - #include "ClauseFinder.h" +#include "flang/Lower/OpenMP/Clauses.h" #include #include #include diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index a7eb2dc5ee664..1526bd4e90233 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -9,7 +9,7 @@ #ifndef FORTRAN_LOWER_OPENMPUTILS_H #define FORTRAN_LOWER_OPENMPUTILS_H -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Location.h" #include "mlir/IR/Value.h" diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp similarity index 77% rename from flang/lib/Lower/OpenMP/ReductionProcessor.cpp rename to flang/lib/Lower/Support/ReductionProcessor.cpp index 330cef7b54c74..14b2c9836748f 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -10,10 +10,11 @@ // //===----------------------------------------------------------------------===// -#include "ReductionProcessor.h" +#include "flang/Lower/Support/ReductionProcessor.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/ConvertType.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/Support/PrivateReductionUtils.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/Complex.h" @@ -21,8 +22,6 @@ #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" -#include "flang/Optimizer/Support/FatalError.h" -#include "flang/Parser/tools.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "llvm/Support/CommandLine.h" #include @@ -40,35 +39,35 @@ namespace lower { namespace omp { // explicit template 
declarations -template void -ReductionProcessor::processReductionArguments( +template void ReductionProcessor::processReductionArguments< + mlir::omp::DeclareReductionOp, omp::clause::ReductionOperatorList>( mlir::Location currentLocation, lower::AbstractConverter &converter, - const omp::clause::Reduction &reduction, + const omp::clause::ReductionOperatorList &redOperatorList, llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod); + const llvm::SmallVectorImpl &reductionSymbols); -template void -ReductionProcessor::processReductionArguments( +template void ReductionProcessor::processReductionArguments< + fir::DeclareReductionOp, llvm::SmallVector>( mlir::Location currentLocation, lower::AbstractConverter &converter, - const omp::clause::TaskReduction &reduction, + const llvm::SmallVector &redOperatorList, llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod); + const llvm::SmallVectorImpl &reductionSymbols); -template void -ReductionProcessor::processReductionArguments( - mlir::Location currentLocation, lower::AbstractConverter &converter, - const omp::clause::InReduction &reduction, - llvm::SmallVectorImpl &reductionVars, - llvm::SmallVectorImpl &reduceVarByRef, - llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod); +template mlir::omp::DeclareReductionOp +ReductionProcessor::createDeclareReduction( + AbstractConverter &converter, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, + bool isByRef); + +template fir::DeclareReductionOp +ReductionProcessor::createDeclareReduction( + AbstractConverter &converter, 
llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, + bool isByRef); ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( const omp::clause::ProcedureDesignator &pd) { @@ -106,6 +105,37 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( } } +ReductionProcessor::ReductionIdentifier +ReductionProcessor::getReductionType(const fir::ReduceOperationEnum &redOp) { + switch (redOp) { + case fir::ReduceOperationEnum::Add: + return ReductionIdentifier::ADD; + case fir::ReduceOperationEnum::Multiply: + return ReductionIdentifier::MULTIPLY; + + case fir::ReduceOperationEnum::AND: + return ReductionIdentifier::AND; + case fir::ReduceOperationEnum::OR: + return ReductionIdentifier::OR; + + case fir::ReduceOperationEnum::EQV: + return ReductionIdentifier::EQV; + case fir::ReduceOperationEnum::NEQV: + return ReductionIdentifier::NEQV; + + case fir::ReduceOperationEnum::IAND: + return ReductionIdentifier::IAND; + case fir::ReduceOperationEnum::IEOR: + return ReductionIdentifier::IEOR; + case fir::ReduceOperationEnum::IOR: + return ReductionIdentifier::IOR; + case fir::ReduceOperationEnum::MAX: + return ReductionIdentifier::MAX; + case fir::ReduceOperationEnum::MIN: + return ReductionIdentifier::MIN; + } +} + bool ReductionProcessor::supportedIntrinsicProcReduction( const omp::clause::ProcedureDesignator &pd) { semantics::Symbol *sym = pd.v.sym(); @@ -136,28 +166,29 @@ ReductionProcessor::getReductionName(llvm::StringRef name, return fir::getTypeAsString(ty, kindMap, (name + byrefAddition).str()); } -std::string ReductionProcessor::getReductionName( - omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp, - const fir::KindMapping &kindMap, mlir::Type ty, bool isByRef) { +std::string +ReductionProcessor::getReductionName(ReductionIdentifier redId, + const fir::KindMapping &kindMap, + mlir::Type ty, bool isByRef) { std::string reductionName; - switch (intrinsicOp) { - 
case omp::clause::DefinedOperator::IntrinsicOperator::Add: + switch (redId) { + case ReductionIdentifier::ADD: reductionName = "add_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::Multiply: + case ReductionIdentifier::MULTIPLY: reductionName = "multiply_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::AND: + case ReductionIdentifier::AND: reductionName = "and_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::EQV: + case ReductionIdentifier::EQV: reductionName = "eqv_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::OR: + case ReductionIdentifier::OR: reductionName = "or_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::NEQV: + case ReductionIdentifier::NEQV: reductionName = "neqv_reduction"; break; default: @@ -334,8 +365,18 @@ mlir::Value ReductionProcessor::createScalarCombiner( return reductionOp; } +template +static void genYield(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value yieldedValue) { + if constexpr (std::is_same_v) + builder.create(loc, yieldedValue); + else + builder.create(loc, yieldedValue); +} + /// Create reduction combiner region for reduction variables which are boxed /// arrays +template static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, ReductionProcessor::ReductionIdentifier redId, fir::BaseBoxType boxTy, mlir::Value lhs, @@ -369,7 +410,7 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value result = ReductionProcessor::createScalarCombiner( builder, loc, redId, eleTy, lhs, rhs); builder.create(loc, result, lhsValAddr); - builder.create(loc, lhsAddr); + genYield(builder, loc, lhsAddr); return; } @@ -408,10 +449,11 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, scalarReduction, lhsEleAddr); builder.setInsertionPointAfter(nest.outerOp); - builder.create(loc, lhsAddr); + genYield(builder, loc, 
lhsAddr); } // generate combiner region for reduction operations +template static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, ReductionProcessor::ReductionIdentifier redId, mlir::Type ty, mlir::Value lhs, mlir::Value rhs, @@ -426,15 +468,15 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder, loc, redId, ty, lhsLoaded, rhsLoaded); if (isByRef) { builder.create(loc, result, lhs); - builder.create(loc, lhs); + genYield(builder, loc, lhs); } else { - builder.create(loc, result); + genYield(builder, loc, result); } return; } // all arrays should have been boxed if (auto boxTy = mlir::dyn_cast(ty)) { - genBoxCombiner(builder, loc, redId, boxTy, lhs, rhs); + genBoxCombiner(builder, loc, redId, boxTy, lhs, rhs); return; } @@ -454,15 +496,13 @@ static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) { return ty; } +template static void createReductionAllocAndInitRegions( - AbstractConverter &converter, mlir::Location loc, - mlir::omp::DeclareReductionOp &reductionDecl, + AbstractConverter &converter, mlir::Location loc, OpType &reductionDecl, const ReductionProcessor::ReductionIdentifier redId, mlir::Type type, bool isByRef) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - auto yield = [&](mlir::Value ret) { - builder.create(loc, ret); - }; + auto yield = [&](mlir::Value ret) { genYield(builder, loc, ret); }; mlir::Block *allocBlock = nullptr; mlir::Block *initBlock = nullptr; @@ -489,7 +529,9 @@ static void createReductionAllocAndInitRegions( converter, loc, type, initValue, initBlock, reductionDecl.getInitializerAllocArg(), reductionDecl.getInitializerMoldArg(), reductionDecl.getCleanupRegion(), - DeclOperationKind::Reduction); + DeclOperationKind::Reduction, /*sym=*/nullptr, + /*cannotHaveLowerBounds=*/false, + /*isDoConcurrent*/ std::is_same_v); } if (fir::isa_trivial(ty)) { @@ -512,7 +554,8 @@ static void createReductionAllocAndInitRegions( yield(boxAlloca); } -mlir::omp::DeclareReductionOp 
ReductionProcessor::createDeclareReduction( +template +OpType ReductionProcessor::createDeclareReduction( AbstractConverter &converter, llvm::StringRef reductionOpName, const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, bool isByRef) { @@ -522,8 +565,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( assert(!reductionOpName.empty()); - auto decl = - module.lookupSymbol(reductionOpName); + auto decl = module.lookupSymbol(reductionOpName); if (decl) return decl; @@ -532,8 +574,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( if (!isByRef) type = valTy; - decl = modBuilder.create(loc, reductionOpName, - type); + decl = modBuilder.create(loc, reductionOpName, type); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, isByRef); @@ -544,7 +585,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - genCombiner(builder, loc, redId, type, op1, op2, isByRef); + genCombiner(builder, loc, redId, type, op1, op2, isByRef); return decl; } @@ -563,64 +604,41 @@ static bool doReductionByRef(mlir::Value reductionVar) { return false; } -mlir::omp::ReductionModifier translateReductionModifier(ReductionModifier mod) { - switch (mod) { - case ReductionModifier::Default: - return mlir::omp::ReductionModifier::defaultmod; - case ReductionModifier::Inscan: - return mlir::omp::ReductionModifier::inscan; - case ReductionModifier::Task: - return mlir::omp::ReductionModifier::task; - } - return mlir::omp::ReductionModifier::defaultmod; -} - -template +template void ReductionProcessor::processReductionArguments( mlir::Location currentLocation, lower::AbstractConverter &converter, - const T &reduction, llvm::SmallVectorImpl &reductionVars, + const RedOperatorListTy 
&redOperatorList, + llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - if constexpr (std::is_same_v) { - auto mod = std::get>(reduction.t); - if (mod.has_value()) { - if (mod.value() == ReductionModifier::Task) - TODO(currentLocation, "Reduction modifier `task` is not supported"); - else - *reductionMod = mlir::omp::ReductionModifierAttr::get( - firOpBuilder.getContext(), translateReductionModifier(mod.value())); - } - } - - mlir::omp::DeclareReductionOp decl; - const auto &redOperatorList{ - std::get(reduction.t)}; - assert(redOperatorList.size() == 1 && "Expecting single operator"); - const auto &redOperator = redOperatorList.front(); - const auto &objectList{std::get(reduction.t)}; - - if (!std::holds_alternative(redOperator.u)) { - if (const auto *reductionIntrinsic = - std::get_if(&redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) { + const llvm::SmallVectorImpl &reductionSymbols) { + if constexpr (std::is_same_v) { + // For OpenMP reduction clauses, check if the reduction operator is + // supported. 
+ assert(redOperatorList.size() == 1 && "Expecting single operator"); + const Fortran::lower::omp::clause::ReductionOperator &redOperator = + redOperatorList.front(); + + if (!std::holds_alternative(redOperator.u)) { + if (const auto *reductionIntrinsic = + std::get_if(&redOperator.u)) { + if (!ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) { + return; + } + } else { return; } - } else { - return; } } + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + // Reduction variable processing common to both intrinsic operators and // procedure designators fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - for (const Object &object : objectList) { - const semantics::Symbol *symbol = object.sym(); - reductionSymbols.push_back(symbol); + for (const semantics::Symbol *symbol : reductionSymbols) { mlir::Value symVal = converter.getSymbolAddress(*symbol); mlir::Type eleType; auto refType = mlir::dyn_cast_or_null(symVal.getType()); @@ -672,52 +690,63 @@ void ReductionProcessor::processReductionArguments( reduceVarByRef.push_back(doReductionByRef(symVal)); } + unsigned idx = 0; for (auto [symVal, isByRef] : llvm::zip(reductionVars, reduceVarByRef)) { auto redType = mlir::cast(symVal.getType()); const auto &kindMap = firOpBuilder.getKindMap(); std::string reductionName; ReductionIdentifier redId; - if (const auto &redDefinedOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - redDefinedOp->u)}; - redId = getReductionType(intrinsicOp); - switch (redId) { - case ReductionIdentifier::ADD: - case ReductionIdentifier::MULTIPLY: - case ReductionIdentifier::AND: - case ReductionIdentifier::EQV: - case ReductionIdentifier::OR: - case ReductionIdentifier::NEQV: - break; - default: - TODO(currentLocation, - "Reduction of some intrinsic operators is not supported"); - break; - } - - reductionName = getReductionName(intrinsicOp, kindMap, redType, isByRef); - } else if (const auto *reductionIntrinsic = - 
std::get_if( - &redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) { - TODO(currentLocation, "Unsupported intrinsic proc reduction"); + if constexpr (std::is_same_v) { + const Fortran::lower::omp::clause::ReductionOperator &redOperator = + redOperatorList.front(); + if (const auto &redDefinedOp = + std::get_if(&redOperator.u)) { + const auto &intrinsicOp{ + std::get( + redDefinedOp->u)}; + redId = getReductionType(intrinsicOp); + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: + break; + default: + TODO(currentLocation, + "Reduction of some intrinsic operators is not supported"); + break; + } + + reductionName = getReductionName(redId, kindMap, redType, isByRef); + } else if (const auto *reductionIntrinsic = + std::get_if( + &redOperator.u)) { + if (!ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) { + TODO(currentLocation, "Unsupported intrinsic proc reduction"); + } + redId = getReductionType(*reductionIntrinsic); + reductionName = + getReductionName(getRealName(*reductionIntrinsic).ToString(), + kindMap, redType, isByRef); + } else { + TODO(currentLocation, "Unexpected reduction type"); } - redId = getReductionType(*reductionIntrinsic); - reductionName = - getReductionName(getRealName(*reductionIntrinsic).ToString(), kindMap, - redType, isByRef); } else { - TODO(currentLocation, "Unexpected reduction type"); + // `do concurrent` reductions + redId = getReductionType(redOperatorList[idx]); + reductionName = getReductionName(redId, kindMap, redType, isByRef); } - decl = createDeclareReduction(converter, reductionName, redId, redType, - currentLocation, isByRef); + OpType decl = createDeclareReduction( + converter, reductionName, redId, redType, currentLocation, isByRef); reductionDeclSymbols.push_back( 
mlir::SymbolRefAttr::get(firOpBuilder.getContext(), decl.getSymName())); + ++idx; } } diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index c65f51ce6cacd..b9d2574a76ad0 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -668,9 +668,7 @@ void privatizeSymbol( const semantics::Symbol *sym = isDoConcurrent ? &symToPrivatize->GetUltimate() : symToPrivatize; - const lower::SymbolBox hsb = isDoConcurrent - ? converter.shallowLookupSymbol(*sym) - : converter.lookupOneLevelUpSymbol(*sym); + const lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); assert(hsb && "Host symbol box not found"); mlir::Location symLoc = hsb.getAddr().getLoc(); diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index b5cabdb830e5c..acd5a88a2582d 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -286,6 +286,9 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() { if (auto firLocalOp = getRegion().getParentOfType()) return &getRegion().front(); + if (auto firLocalOp = getRegion().getParentOfType()) + return &getRegion().front(); + return getEntryBlock(); } diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 3bbc32f23bcfa..ecc04a6c9a2be 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -2239,18 +2239,17 @@ struct XReboxOpConversion : public EmboxCommonConversion { getSubcomponentIndices(rebox, rebox.getBox(), operands, fieldIndices); if (!rebox.getSubstr().empty()) substringOffset = operands[rebox.getSubstrOperandIndex()]; - base = - genBoxOffsetGep(rewriter, loc, base, llvmBaseObjectType, zero, - /*cstInteriorIndices=*/llvm::ArrayRef(), - fieldIndices, substringOffset); + base = genBoxOffsetGep(rewriter, loc, base, llvmBaseObjectType, zero, + /*cstInteriorIndices=*/{}, fieldIndices, + substringOffset); } if 
(rebox.getSlice().empty()) // The array section is of the form array[%component][substring], keep // the input array extents and strides. return finalizeRebox(rebox, adaptor, destBoxTy, dest, base, - /*lbounds*/ llvm::ArrayRef(), - inputExtents, inputStrides, rewriter); + /*lbounds*/ {}, inputExtents, inputStrides, + rewriter); // The slice is of the form array(i:j:k)[%component]. Compute new extents // and strides. @@ -2298,8 +2297,8 @@ struct XReboxOpConversion : public EmboxCommonConversion { } } return finalizeRebox(rebox, adaptor, destBoxTy, dest, base, - /*lbounds*/ llvm::ArrayRef(), - slicedExtents, slicedStrides, rewriter); + /*lbounds*/ {}, slicedExtents, slicedStrides, + rewriter); } /// Apply a new shape to the data described by a box given the base address, @@ -3342,26 +3341,26 @@ struct LoadOpConversion : public fir::FIROpConversion { } }; -struct LocalitySpecifierOpConversion - : public fir::FIROpConversion { - using FIROpConversion::FIROpConversion; +template +struct DoConcurrentSpecifierOpConversion : public fir::FIROpConversion { + using fir::FIROpConversion::FIROpConversion; llvm::LogicalResult - matchAndRewrite(fir::LocalitySpecifierOp localizer, OpAdaptor adaptor, + matchAndRewrite(OpTy specifier, typename OpTy::Adaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { #ifdef EXPENSIVE_CHECKS auto uses = mlir::SymbolTable::getSymbolUses( - localizer, localizer->getParentOfType()); + specifier, specifier->getParentOfType()); - // `fir.local` ops are not supposed to have any uses at this point (i.e. - // during lowering to LLVM). In case of serialization, the - // `fir.do_concurrent` users are expected to have been lowered to + // `fir.local|fir.declare_reduction` ops are not supposed to have any uses + // at this point (i.e. during lowering to LLVM). In case of serialization, + // the `fir.do_concurrent` users are expected to have been lowered to // `fir.do_loop` nests. 
In case of parallelization, the `fir.do_concurrent` // users are expected to have been lowered to the target parallel model // (e.g. OpenMP). assert(uses && uses->empty()); #endif - rewriter.eraseOp(localizer); + rewriter.eraseOp(specifier); return mlir::success(); } }; @@ -3397,8 +3396,7 @@ static void genBrOp(A caseOp, mlir::Block *dest, std::optional destOps, if (destOps) rewriter.replaceOpWithNewOp(caseOp, *destOps, dest); else - rewriter.replaceOpWithNewOp( - caseOp, llvm::ArrayRef(), dest); + rewriter.replaceOpWithNewOp(caseOp, B{}, dest); } static void genCaseLadderStep(mlir::Location loc, mlir::Value cmp, @@ -4330,20 +4328,22 @@ void fir::populateFIRToLLVMConversionPatterns( BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion, CmpcOpConversion, VolatileCastOpConversion, ConvertOpConversion, CoordinateOpConversion, CopyOpConversion, DTEntryOpConversion, - DeclareOpConversion, DivcOpConversion, EmboxOpConversion, - EmboxCharOpConversion, EmboxProcOpConversion, ExtractValueOpConversion, - FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion, - GlobalLenOpConversion, GlobalOpConversion, InsertOnRangeOpConversion, - IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion, - LocalitySpecifierOpConversion, MulcOpConversion, NegcOpConversion, - NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion, - SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion, - ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion, - StoreOpConversion, StringLitOpConversion, SubcOpConversion, - TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion, - UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, - XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion, - ZeroOpConversion>(converter, options); + DeclareOpConversion, + DoConcurrentSpecifierOpConversion, + DoConcurrentSpecifierOpConversion, + DivcOpConversion, EmboxOpConversion, EmboxCharOpConversion, + EmboxProcOpConversion, 
ExtractValueOpConversion, FieldIndexOpConversion, + FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, + GlobalOpConversion, InsertOnRangeOpConversion, IsPresentOpConversion, + LenParamIndexOpConversion, LoadOpConversion, MulcOpConversion, + NegcOpConversion, NoReassocOpConversion, SelectCaseOpConversion, + SelectOpConversion, SelectRankOpConversion, SelectTypeOpConversion, + ShapeOpConversion, ShapeShiftOpConversion, ShiftOpConversion, + SliceOpConversion, StoreOpConversion, StringLitOpConversion, + SubcOpConversion, TypeDescOpConversion, TypeInfoOpConversion, + UnboxCharOpConversion, UnboxProcOpConversion, UndefOpConversion, + UnreachableOpConversion, XArrayCoorOpConversion, XEmboxOpConversion, + XReboxOpConversion, ZeroOpConversion>(converter, options); // Patterns that are populated without a type converter do not trigger // target materializations for the operands of the root op. diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp index eca2c7f7c942f..b60ac11c7795a 100644 --- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp @@ -107,9 +107,8 @@ class EmboxConversion : public mlir::OpRewritePattern { shapeOpers.push_back(extVal); } auto xbox = rewriter.create( - loc, embox.getType(), embox.getMemref(), shapeOpers, - llvm::ArrayRef(), llvm::ArrayRef(), - llvm::ArrayRef(), llvm::ArrayRef(), + loc, embox.getType(), embox.getMemref(), shapeOpers, mlir::ValueRange{}, + mlir::ValueRange{}, mlir::ValueRange{}, mlir::ValueRange{}, embox.getTypeparams(), embox.getSourceBox(), embox.getAllocatorIdxAttr()); LLVM_DEBUG(llvm::dbgs() << "rewriting " << embox << " to " << xbox << '\n'); diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index ecfa2939e96a6..6b40e7015fdd8 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -5041,6 +5041,9 @@ void 
fir::BoxTotalElementsOp::getCanonicalizationPatterns( // LocalitySpecifierOp //===----------------------------------------------------------------------===// +// TODO This is a copy of omp::PrivateClauseOp::verifiyRegions(). Once we find a +// solution to merge both ops into one this duplication will not be needed. See: +// https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108. llvm::LogicalResult fir::LocalitySpecifierOp::verifyRegions() { mlir::Type argType = getArgType(); auto verifyTerminator = [&](mlir::Operation *terminator, @@ -5136,6 +5139,84 @@ llvm::LogicalResult fir::LocalitySpecifierOp::verifyRegions() { return llvm::success(); } +// TODO This is a copy of omp::DeclareReductionOp::verifiyRegions(). Once we +// find a solution to merge both ops into one this duplication will not be +// needed. +mlir::LogicalResult fir::DeclareReductionOp::verifyRegions() { + if (!getAllocRegion().empty()) { + for (YieldOp yieldOp : getAllocRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects alloc region to yield a value " + "of the reduction type"; + } + } + + if (getInitializerRegion().empty()) + return emitOpError() << "expects non-empty initializer region"; + mlir::Block &initializerEntryBlock = getInitializerRegion().front(); + + if (initializerEntryBlock.getNumArguments() == 1) { + if (!getAllocRegion().empty()) + return emitOpError() << "expects two arguments to the initializer region " + "when an allocation region is used"; + } else if (initializerEntryBlock.getNumArguments() == 2) { + if (getAllocRegion().empty()) + return emitOpError() << "expects one argument to the initializer region " + "when no allocation region is used"; + } else { + return emitOpError() + << "expects one or two arguments to the initializer region"; + } + + for (mlir::Value arg : initializerEntryBlock.getArguments()) + if 
(arg.getType() != getType()) + return emitOpError() << "expects initializer region argument to match " + "the reduction type"; + + for (YieldOp yieldOp : getInitializerRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects initializer region to yield a value " + "of the reduction type"; + } + + if (getReductionRegion().empty()) + return emitOpError() << "expects non-empty reduction region"; + mlir::Block &reductionEntryBlock = getReductionRegion().front(); + if (reductionEntryBlock.getNumArguments() != 2 || + reductionEntryBlock.getArgumentTypes()[0] != + reductionEntryBlock.getArgumentTypes()[1] || + reductionEntryBlock.getArgumentTypes()[0] != getType()) + return emitOpError() << "expects reduction region with two arguments of " + "the reduction type"; + for (YieldOp yieldOp : getReductionRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects reduction region to yield a value " + "of the reduction type"; + } + + if (!getAtomicReductionRegion().empty()) { + mlir::Block &atomicReductionEntryBlock = getAtomicReductionRegion().front(); + if (atomicReductionEntryBlock.getNumArguments() != 2 || + atomicReductionEntryBlock.getArgumentTypes()[0] != + atomicReductionEntryBlock.getArgumentTypes()[1]) + return emitOpError() << "expects atomic reduction region with two " + "arguments of the same type"; + } + + if (getCleanupRegion().empty()) + return mlir::success(); + mlir::Block &cleanupEntryBlock = getCleanupRegion().front(); + if (cleanupEntryBlock.getNumArguments() != 1 || + cleanupEntryBlock.getArgument(0).getType() != getType()) + return emitOpError() << "expects cleanup region with one argument " + "of the reduction type"; + + return mlir::success(); +} + //===----------------------------------------------------------------------===// // DoConcurrentOp 
//===----------------------------------------------------------------------===// @@ -5157,6 +5238,97 @@ llvm::LogicalResult fir::DoConcurrentOp::verify() { // DoConcurrentLoopOp //===----------------------------------------------------------------------===// +static mlir::ParseResult parseSpecifierList( + mlir::OpAsmParser &parser, mlir::OperationState &result, + llvm::StringRef specifierKeyword, llvm::StringRef symsAttrName, + llvm::SmallVectorImpl ®ionArgs, + llvm::SmallVectorImpl ®ionArgTypes, + int32_t &numSpecifierOperands, bool isReduce = false) { + auto &builder = parser.getBuilder(); + llvm::SmallVector specifierOperands; + + if (failed(parser.parseOptionalKeyword(specifierKeyword))) + return mlir::success(); + + std::size_t oldArgTypesSize = regionArgTypes.size(); + if (failed(parser.parseLParen())) + return mlir::failure(); + + llvm::SmallVector isByRefVec; + llvm::SmallVector spceifierSymbolVec; + llvm::SmallVector attributes; + + if (failed(parser.parseCommaSeparatedList([&]() { + if (isReduce) + isByRefVec.push_back( + parser.parseOptionalKeyword("byref").succeeded()); + + if (failed(parser.parseAttribute(spceifierSymbolVec.emplace_back()))) + return mlir::failure(); + + if (isReduce && + failed(parser.parseAttribute(attributes.emplace_back()))) + return mlir::failure(); + + if (parser.parseOperand(specifierOperands.emplace_back()) || + parser.parseArrow() || + parser.parseArgument(regionArgs.emplace_back())) + return mlir::failure(); + + return mlir::success(); + }))) + return mlir::failure(); + + if (failed(parser.parseColon())) + return mlir::failure(); + + if (failed(parser.parseCommaSeparatedList([&]() { + if (failed(parser.parseType(regionArgTypes.emplace_back()))) + return mlir::failure(); + + return mlir::success(); + }))) + return mlir::failure(); + + if (regionArgs.size() != regionArgTypes.size()) + return parser.emitError(parser.getNameLoc(), "mismatch in number of " + + specifierKeyword.str() + + " arg and types"); + + if 
(failed(parser.parseRParen())) + return mlir::failure(); + + for (auto operandType : + llvm::zip_equal(specifierOperands, + llvm::drop_begin(regionArgTypes, oldArgTypesSize))) + if (parser.resolveOperand(std::get<0>(operandType), + std::get<1>(operandType), result.operands)) + return mlir::failure(); + + if (isReduce) + result.addAttribute( + fir::DoConcurrentLoopOp::getReduceByrefAttrName(result.name), + isByRefVec.empty() + ? nullptr + : mlir::DenseBoolArrayAttr::get(builder.getContext(), isByRefVec)); + + llvm::SmallVector symbolAttrs(spceifierSymbolVec.begin(), + spceifierSymbolVec.end()); + result.addAttribute(symsAttrName, builder.getArrayAttr(symbolAttrs)); + + if (isReduce) { + llvm::SmallVector arrayAttr(attributes.begin(), + attributes.end()); + result.addAttribute( + fir::DoConcurrentLoopOp::getReduceAttrsAttrName(result.name), + builder.getArrayAttr(arrayAttr)); + } + + numSpecifierOperands = specifierOperands.size(); + + return mlir::success(); +} + mlir::ParseResult fir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser, mlir::OperationState &result) { auto &builder = parser.getBuilder(); @@ -5192,90 +5364,26 @@ mlir::ParseResult fir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser, parser.resolveOperands(steps, builder.getIndexType(), result.operands)) return mlir::failure(); - llvm::SmallVector reduceOperands; - llvm::SmallVector reduceArgTypes; - if (succeeded(parser.parseOptionalKeyword("reduce"))) { - // Parse reduction attributes and variables. - llvm::SmallVector attributes; - if (failed(parser.parseCommaSeparatedList( - mlir::AsmParser::Delimiter::Paren, [&]() { - if (parser.parseAttribute(attributes.emplace_back()) || - parser.parseArrow() || - parser.parseOperand(reduceOperands.emplace_back()) || - parser.parseColonType(reduceArgTypes.emplace_back())) - return mlir::failure(); - return mlir::success(); - }))) - return mlir::failure(); - // Resolve input operands. 
- for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes)) - if (parser.resolveOperand(std::get<0>(operand_type), - std::get<1>(operand_type), result.operands)) - return mlir::failure(); - llvm::SmallVector arrayAttr(attributes.begin(), - attributes.end()); - result.addAttribute(getReduceAttrsAttrName(result.name), - builder.getArrayAttr(arrayAttr)); - } - - llvm::SmallVector localOperands; - if (succeeded(parser.parseOptionalKeyword("local"))) { - std::size_t oldArgTypesSize = argTypes.size(); - if (failed(parser.parseLParen())) - return mlir::failure(); - - llvm::SmallVector localSymbolVec; - if (failed(parser.parseCommaSeparatedList([&]() { - if (failed(parser.parseAttribute(localSymbolVec.emplace_back()))) - return mlir::failure(); - - if (parser.parseOperand(localOperands.emplace_back()) || - parser.parseArrow() || - parser.parseArgument(regionArgs.emplace_back())) - return mlir::failure(); - - return mlir::success(); - }))) - return mlir::failure(); - - if (failed(parser.parseColon())) - return mlir::failure(); - - if (failed(parser.parseCommaSeparatedList([&]() { - if (failed(parser.parseType(argTypes.emplace_back()))) - return mlir::failure(); - - return mlir::success(); - }))) - return mlir::failure(); - - if (regionArgs.size() != argTypes.size()) - return parser.emitError(parser.getNameLoc(), - "mismatch in number of local arg and types"); - - if (failed(parser.parseRParen())) - return mlir::failure(); - - for (auto operandType : llvm::zip_equal( - localOperands, llvm::drop_begin(argTypes, oldArgTypesSize))) - if (parser.resolveOperand(std::get<0>(operandType), - std::get<1>(operandType), result.operands)) - return mlir::failure(); + int32_t numLocalOperands = 0; + if (failed(parseSpecifierList(parser, result, "local", + getLocalSymsAttrName(result.name), regionArgs, + argTypes, numLocalOperands))) + return mlir::failure(); - llvm::SmallVector symbolAttrs(localSymbolVec.begin(), - localSymbolVec.end()); - 
result.addAttribute(getLocalSymsAttrName(result.name), - builder.getArrayAttr(symbolAttrs)); - } + int32_t numReduceOperands = 0; + if (failed(parseSpecifierList( + parser, result, "reduce", getReduceSymsAttrName(result.name), + regionArgs, argTypes, numReduceOperands, /*isReduce=*/true))) + return mlir::failure(); // Set `operandSegmentSizes` attribute. - result.addAttribute(DoConcurrentLoopOp::getOperandSegmentSizeAttr(), - builder.getDenseI32ArrayAttr( - {static_cast(lower.size()), - static_cast(upper.size()), - static_cast(steps.size()), - static_cast(reduceOperands.size()), - static_cast(localOperands.size())})); + result.addAttribute( + DoConcurrentLoopOp::getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr({static_cast(lower.size()), + static_cast(upper.size()), + static_cast(steps.size()), + static_cast(numLocalOperands), + static_cast(numReduceOperands)})); // Now parse the body. for (auto [arg, type] : llvm::zip_equal(regionArgs, argTypes)) @@ -5297,17 +5405,6 @@ void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) { << ") = (" << getLowerBound() << ") to (" << getUpperBound() << ") step (" << getStep() << ")"; - if (!getReduceOperands().empty()) { - p << " reduce("; - auto attrs = getReduceAttrsAttr(); - auto operands = getReduceOperands(); - llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) { - p << std::get<0>(it) << " -> " << std::get<1>(it) << " : " - << std::get<1>(it).getType(); - }); - p << ')'; - } - if (!getLocalVars().empty()) { p << " local("; llvm::interleaveComma(llvm::zip_equal(getLocalSymsAttr(), getLocalVars(), @@ -5322,13 +5419,34 @@ void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) { p << ")"; } + if (!getReduceVars().empty()) { + p << " reduce("; + llvm::interleaveComma( + llvm::zip_equal(getReduceByrefAttr().asArrayRef(), getReduceSymsAttr(), + getReduceAttrsAttr(), getReduceVars(), + getRegionReduceArgs()), + p, [&](auto it) { + if (std::get<0>(it)) + p << "byref "; + + p << std::get<1>(it) << " 
" << std::get<2>(it) << " " + << std::get<3>(it) << " -> " << std::get<4>(it); + }); + p << " : "; + llvm::interleaveComma(getReduceVars(), p, + [&](auto it) { p << it.getType(); }); + p << ")"; + } + p << ' '; p.printRegion(getRegion(), /*printEntryBlockArgs=*/false); p.printOptionalAttrDict( (*this)->getAttrs(), /*elidedAttrs=*/{DoConcurrentLoopOp::getOperandSegmentSizeAttr(), + DoConcurrentLoopOp::getLocalSymsAttrName(), + DoConcurrentLoopOp::getReduceSymsAttrName(), DoConcurrentLoopOp::getReduceAttrsAttrName(), - DoConcurrentLoopOp::getLocalSymsAttrName()}); + DoConcurrentLoopOp::getReduceByrefAttrName()}); } llvm::SmallVector fir::DoConcurrentLoopOp::getLoopRegions() { @@ -5340,6 +5458,7 @@ llvm::LogicalResult fir::DoConcurrentLoopOp::verify() { mlir::Operation::operand_range ubValues = getUpperBound(); mlir::Operation::operand_range stepValues = getStep(); mlir::Operation::operand_range localVars = getLocalVars(); + mlir::Operation::operand_range reduceVars = getReduceVars(); if (lbValues.empty()) return emitOpError( @@ -5353,7 +5472,8 @@ llvm::LogicalResult fir::DoConcurrentLoopOp::verify() { // Check that the body defines the same number of block arguments as the // number of tuple elements in step. 
mlir::Block *body = getBody(); - unsigned numIndVarArgs = body->getNumArguments() - localVars.size(); + unsigned numIndVarArgs = + body->getNumArguments() - localVars.size() - reduceVars.size(); if (numIndVarArgs != stepValues.size()) return emitOpError() << "expects the same number of induction variables: " diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index 2ff1d6d945ba3..4a9579cfde37c 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -1533,7 +1533,9 @@ std::optional> fir::getTypeSizeAndAlignment(mlir::Location loc, mlir::Type ty, const mlir::DataLayout &dl, const fir::KindMapping &kindMap) { - if (mlir::isa(ty)) { + if (ty.isIntOrIndexOrFloat() || + mlir::isa(ty)) { llvm::TypeSize size = dl.getTypeSize(ty); unsigned short alignment = dl.getTypeABIAlignment(ty); return std::pair{size, alignment}; diff --git a/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp index 317a41a2129c3..0767733f53728 100644 --- a/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp @@ -29,8 +29,9 @@ namespace fir::acc { -static mlir::TypedValue -getPtrFromVar(mlir::Value var) { +template +mlir::TypedValue +OpenACCMappableModel::getVarPtr(mlir::Type type, mlir::Value var) const { if (auto ptr = mlir::dyn_cast>(var)) return ptr; @@ -44,34 +45,51 @@ getPtrFromVar(mlir::Value var) { return {}; } -template <> -mlir::TypedValue -OpenACCMappableModel::getVarPtr(mlir::Type type, - mlir::Value var) const { - return getPtrFromVar(var); -} - -template <> -mlir::TypedValue +template mlir::TypedValue OpenACCMappableModel::getVarPtr(mlir::Type type, - mlir::Value var) const { - return getPtrFromVar(var); -} + mlir::Value var) const; -template <> -std::optional -OpenACCMappableModel::getSizeInBytes( +template mlir::TypedValue +OpenACCMappableModel::getVarPtr(mlir::Type 
type, + mlir::Value var) const; + +template mlir::TypedValue +OpenACCMappableModel::getVarPtr(mlir::Type type, + mlir::Value var) const; + +template mlir::TypedValue +OpenACCMappableModel::getVarPtr(mlir::Type type, + mlir::Value var) const; + +template +std::optional OpenACCMappableModel::getSizeInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const { - // TODO: Bounds operation affect the total size - add support to take them + // TODO: Bounds operation affect the size - add support to take them // into account. if (!accBounds.empty()) return {}; + // Class-type is either a polymorphic or unlimited polymorphic. In the latter + // case, the size is not computable. But in the former it should be - however, + // fir::getTypeSizeAndAlignment does not support polymorphic types. + if (mlir::isa(type)) { + return {}; + } + + // When requesting the size of a box entity or a reference, the intent + // is to get the size of the data that it is referring to. + mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(type); + assert(eleTy && "expect to be able to unwrap the element type"); + + // If the type enclosed is a mappable type, then have it provide the size. + if (auto mappableTy = mlir::dyn_cast(eleTy)) + return mappableTy.getSizeInBytes(var, accBounds, dataLayout); + // Dynamic extents or unknown ranks generally do not have compile-time // computable dimensions. 
- auto seqType = mlir::cast(type); - if (seqType.hasDynamicExtents() || seqType.hasUnknownShape()) + auto seqType = mlir::dyn_cast(eleTy); + if (seqType && (seqType.hasDynamicExtents() || seqType.hasUnknownShape())) return {}; // Attempt to find an operation that a lookup for KindMapping can be done @@ -85,99 +103,113 @@ OpenACCMappableModel::getSizeInBytes( auto kindMap = fir::getKindMapping(kindMapSrcOp); auto sizeAndAlignment = - fir::getTypeSizeAndAlignment(var.getLoc(), type, dataLayout, kindMap); + fir::getTypeSizeAndAlignment(var.getLoc(), eleTy, dataLayout, kindMap); if (!sizeAndAlignment.has_value()) return {}; return {llvm::TypeSize::getFixed(sizeAndAlignment->first)}; } -template <> -std::optional +template std::optional OpenACCMappableModel::getSizeInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, - const mlir::DataLayout &dataLayout) const { - // If we have a box value instead of box reference, the intent is to - // get the size of the data not the box itself. - if (auto boxTy = mlir::dyn_cast(var.getType())) { - if (auto mappableTy = mlir::dyn_cast( - fir::unwrapRefType(boxTy.getEleTy()))) { - return mappableTy.getSizeInBytes(var, accBounds, dataLayout); - } - } - // Size for boxes is not computable until it gets materialized. 
- return {}; -} + const mlir::DataLayout &dataLayout) const; -template <> -std::optional -OpenACCMappableModel::getOffsetInBytes( +template std::optional +OpenACCMappableModel::getSizeInBytes( + mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, + const mlir::DataLayout &dataLayout) const; + +template std::optional +OpenACCMappableModel::getSizeInBytes( + mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, + const mlir::DataLayout &dataLayout) const; + +template std::optional +OpenACCMappableModel::getSizeInBytes( + mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, + const mlir::DataLayout &dataLayout) const; + +template +std::optional OpenACCMappableModel::getOffsetInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, const mlir::DataLayout &dataLayout) const { - // TODO: Bounds operation affect the offset- add support to take them + // TODO: Bounds operation affect the offset - add support to take them // into account. if (!accBounds.empty()) return {}; + // Class-type does not behave like a normal box because it does not hold an + // element type. Thus special handle it here. + if (mlir::isa(type)) { + // The pointer to the class-type is always at the start address. + return {0}; + } + + mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(type); + assert(eleTy && "expect to be able to unwrap the element type"); + + // If the type enclosed is a mappable type, then have it provide the offset. + if (auto mappableTy = mlir::dyn_cast(eleTy)) + return mappableTy.getOffsetInBytes(var, accBounds, dataLayout); + // Dynamic extents (aka descriptor-based arrays) - may have a offset. // For example, a negative stride may mean a negative offset to compute the // start of array. 
- auto seqType = mlir::cast(type); - if (seqType.hasDynamicExtents() || seqType.hasUnknownShape()) + auto seqType = mlir::dyn_cast(eleTy); + if (seqType && (seqType.hasDynamicExtents() || seqType.hasUnknownShape())) return {}; - // We have non-dynamic extents - but if for some reason the size is not - // computable - assume offset is not either. Otherwise, it is an offset of - // zero. + // If the size is computable and since there are no bounds or dynamic extents, + // then the offset relative to pointer must be zero. if (getSizeInBytes(type, var, accBounds, dataLayout).has_value()) { return {0}; } + + // The offset is not evident because it is relative to the pointer being held. + // And we don't have any further details about this type. return {}; } -template <> -std::optional OpenACCMappableModel::getOffsetInBytes( +template std::optional +OpenACCMappableModel::getOffsetInBytes( mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, - const mlir::DataLayout &dataLayout) const { - // If we have a box value instead of box reference, the intent is to - // get the offset of the data not the offset of the box itself. - if (auto boxTy = mlir::dyn_cast(var.getType())) { - if (auto mappableTy = mlir::dyn_cast( - fir::unwrapRefType(boxTy.getEleTy()))) { - return mappableTy.getOffsetInBytes(var, accBounds, dataLayout); - } - } - // Until boxes get materialized, the offset is not evident because it is - // relative to the pointer being held. 
- return {}; -} + const mlir::DataLayout &dataLayout) const; -template <> -llvm::SmallVector -OpenACCMappableModel::generateAccBounds( - mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const { +template std::optional +OpenACCMappableModel::getOffsetInBytes( + mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, + const mlir::DataLayout &dataLayout) const; + +template std::optional +OpenACCMappableModel::getOffsetInBytes( + mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, + const mlir::DataLayout &dataLayout) const; + +template std::optional +OpenACCMappableModel::getOffsetInBytes( + mlir::Type type, mlir::Value var, mlir::ValueRange accBounds, + const mlir::DataLayout &dataLayout) const; + +static llvm::SmallVector +generateSeqTyAccBounds(fir::SequenceType seqType, mlir::Value var, + mlir::OpBuilder &builder) { assert((mlir::isa(var.getType()) || mlir::isa(var.getType())) && "must be pointer-like or mappable"); - fir::FirOpBuilder firBuilder(builder, var.getDefiningOp()); - auto seqType = mlir::cast(type); mlir::Location loc = var.getLoc(); - mlir::Value varPtr = - mlir::isa(var.getType()) - ? var - : mlir::cast(var.getType()).getVarPtr(var); - if (seqType.hasDynamicExtents() || seqType.hasUnknownShape()) { if (auto boxAddr = - mlir::dyn_cast_if_present(varPtr.getDefiningOp())) { + mlir::dyn_cast_if_present(var.getDefiningOp())) { mlir::Value box = boxAddr.getVal(); auto res = hlfir::translateToExtendedValue(loc, firBuilder, hlfir::Entity(box)); fir::ExtendedValue exv = res.first; mlir::Value boxRef = box; - if (auto boxPtr = getPtrFromVar(box)) { + if (auto boxPtr = mlir::cast(box.getType()) + .getVarPtr(box)) { boxRef = boxPtr; } // TODO: Handle Fortran optional. 
@@ -189,7 +221,7 @@ OpenACCMappableModel::generateAccBounds( firBuilder, loc, exv, info); } - if (mlir::isa(varPtr.getDefiningOp())) { + if (mlir::isa(var.getDefiningOp())) { mlir::Value zero = firBuilder.createIntegerConstant(loc, builder.getIndexType(), 0); mlir::Value one = @@ -197,10 +229,10 @@ OpenACCMappableModel::generateAccBounds( mlir::Value shape; if (auto declareOp = - mlir::dyn_cast_if_present(varPtr.getDefiningOp())) + mlir::dyn_cast_if_present(var.getDefiningOp())) shape = declareOp.getShape(); else if (auto declareOp = mlir::dyn_cast_if_present( - varPtr.getDefiningOp())) + var.getDefiningOp())) shape = declareOp.getShape(); const bool strideIncludeLowerExtent = true; @@ -265,9 +297,9 @@ OpenACCMappableModel::generateAccBounds( // TODO: Detect assumed-size case. const bool isAssumedSize = false; - auto valToCheck = varPtr; + auto valToCheck = var; if (auto boxAddr = - mlir::dyn_cast_if_present(varPtr.getDefiningOp())) { + mlir::dyn_cast_if_present(var.getDefiningOp())) { valToCheck = boxAddr.getVal(); } auto res = hlfir::translateToExtendedValue(loc, firBuilder, @@ -279,86 +311,34 @@ OpenACCMappableModel::generateAccBounds( /*isAssumedSize=*/isAssumedSize); } -template <> +template llvm::SmallVector -OpenACCMappableModel::generateAccBounds( - mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const { - // If we have a box value instead of box reference, the intent is to - // get the bounds of the data not the bounds of the box itself. - if (auto boxTy = mlir::dyn_cast(var.getType())) { - if (auto mappableTy = mlir::dyn_cast( - fir::unwrapRefType(boxTy.getEleTy()))) { - mlir::Value data = builder.create(var.getLoc(), var); - return mappableTy.generateAccBounds(data, builder); - } +OpenACCMappableModel::generateAccBounds(mlir::Type type, mlir::Value var, + mlir::OpBuilder &builder) const { + // acc bounds only make sense for arrays - thus look for sequence type. 
+ mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(type); + if (auto seqTy = mlir::dyn_cast_if_present(eleTy)) { + return generateSeqTyAccBounds(seqTy, var, builder); } - // Box references are not arrays - thus generating acc.bounds does not make - // sense. - return {}; -} - -static bool isScalarLike(mlir::Type type) { - return fir::isa_trivial(type) || fir::isa_ref_type(type); -} - -static bool isArrayLike(mlir::Type type) { - return mlir::isa(type); -} -static bool isCompositeLike(mlir::Type type) { - // class(*) is not a composite type since it does not have a determined type. - if (fir::isUnlimitedPolymorphicType(type)) - return false; - - return mlir::isa(type); -} - -template <> -mlir::acc::VariableTypeCategory -OpenACCMappableModel::getTypeCategory( - mlir::Type type, mlir::Value var) const { - return mlir::acc::VariableTypeCategory::array; + return {}; } -template <> -mlir::acc::VariableTypeCategory -OpenACCMappableModel::getTypeCategory(mlir::Type type, - mlir::Value var) const { - // Class-type does not behave like a normal box because it does not hold an - // element type. Thus special handle it here. - if (mlir::isa(type)) { - // class(*) is not a composite type since it does not have a determined - // type. - if (fir::isUnlimitedPolymorphicType(type)) - return mlir::acc::VariableTypeCategory::uncategorized; - return mlir::acc::VariableTypeCategory::composite; - } - - mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(type); - assert(eleTy && "expect to be able to unwrap the element type"); +template llvm::SmallVector +OpenACCMappableModel::generateAccBounds( + mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; - // If the type enclosed by the box is a mappable type, then have it - // provide the type category. 
- if (auto mappableTy = mlir::dyn_cast(eleTy)) - return mappableTy.getTypeCategory(var); +template llvm::SmallVector +OpenACCMappableModel::generateAccBounds( + mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; - // For all arrays, despite whether they are allocatable, pointer, assumed, - // etc, we'd like to categorize them as "array". - if (isArrayLike(eleTy)) - return mlir::acc::VariableTypeCategory::array; - - // We got here because we don't have an array nor a mappable type. At this - // point, we know we have a type that fits the "aggregate" definition since it - // is a type with a descriptor. Try to refine it by checking if it matches the - // "composite" definition. - if (isCompositeLike(eleTy)) - return mlir::acc::VariableTypeCategory::composite; +template llvm::SmallVector +OpenACCMappableModel::generateAccBounds( + mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; - // Even if we have a scalar type - simply because it is wrapped in a box - // we want to categorize it as "nonscalar". Anything else would've been - // non-scalar anyway. - return mlir::acc::VariableTypeCategory::nonscalar; -} +template llvm::SmallVector +OpenACCMappableModel::generateAccBounds( + mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; static mlir::Value getBaseRef(mlir::TypedValue varPtr) { @@ -389,33 +369,44 @@ getBaseRef(mlir::TypedValue varPtr) { return baseRef; } -static mlir::acc::VariableTypeCategory -categorizePointee(mlir::Type pointer, - mlir::TypedValue varPtr, - mlir::Type varType) { - // FIR uses operations to compute interior pointers. - // So for example, an array element or composite field access to a float - // value would both be represented as !fir.ref. We do not want to treat - // such a reference as a scalar. Thus unwrap interior pointer calculations. 
- auto baseRef = getBaseRef(varPtr); +static bool isScalarLike(mlir::Type type) { + return fir::isa_trivial(type) || fir::isa_ref_type(type); +} - if (auto mappableTy = - mlir::dyn_cast(baseRef.getType())) - return mappableTy.getTypeCategory(baseRef); +static bool isArrayLike(mlir::Type type) { + return mlir::isa(type); +} - // It must be a pointer-like type since it is not a MappableType. - auto ptrLikeTy = mlir::cast(baseRef.getType()); - mlir::Type eleTy = ptrLikeTy.getElementType(); +static bool isCompositeLike(mlir::Type type) { + // class(*) is not a composite type since it does not have a determined type. + if (fir::isUnlimitedPolymorphicType(type)) + return false; - if (auto mappableEleTy = mlir::dyn_cast(eleTy)) - return mappableEleTy.getTypeCategory(varPtr); + return mlir::isa(type); +} - if (isScalarLike(eleTy)) - return mlir::acc::VariableTypeCategory::scalar; +static mlir::acc::VariableTypeCategory +categorizeElemType(mlir::Type enclosingTy, mlir::Type eleTy, mlir::Value var) { + // If the type enclosed is a mappable type, then have it provide the type + // category. + if (auto mappableTy = mlir::dyn_cast(eleTy)) + return mappableTy.getTypeCategory(var); + + // For all arrays, despite whether they are allocatable, pointer, assumed, + // etc, we'd like to categorize them as "array". if (isArrayLike(eleTy)) return mlir::acc::VariableTypeCategory::array; + if (isCompositeLike(eleTy)) return mlir::acc::VariableTypeCategory::composite; + if (mlir::isa(enclosingTy)) { + // Even if we have a scalar type - simply because it is wrapped in a box + // we want to categorize it as "nonscalar". Anything else would've been + // non-scalar anyway. 
+ return mlir::acc::VariableTypeCategory::nonscalar; + } + if (isScalarLike(eleTy)) + return mlir::acc::VariableTypeCategory::scalar; if (mlir::isa(eleTy)) return mlir::acc::VariableTypeCategory::nonscalar; // Assumed-type (type(*))does not have a determined type that can be @@ -431,6 +422,77 @@ categorizePointee(mlir::Type pointer, return mlir::acc::VariableTypeCategory::uncategorized; } +template +mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory(mlir::Type type, + mlir::Value var) const { + // FIR uses operations to compute interior pointers. + // So for example, an array element or composite field access to a float + // value would both be represented as !fir.ref. We do not want to treat + // such a reference as a scalar. Thus unwrap interior pointer calculations. + mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(type); + if (eleTy && isScalarLike(eleTy)) { + if (auto ptrLikeVar = mlir::dyn_cast_if_present< + mlir::TypedValue>(var)) { + auto baseRef = getBaseRef(ptrLikeVar); + if (baseRef != var) { + type = baseRef.getType(); + if (auto mappableTy = mlir::dyn_cast(type)) + return mappableTy.getTypeCategory(baseRef); + } + } + } + + // Class-type does not behave like a normal box because it does not hold an + // element type. Thus special handle it here. + if (mlir::isa(type)) { + // class(*) is not a composite type since it does not have a determined + // type. 
+ if (fir::isUnlimitedPolymorphicType(type)) + return mlir::acc::VariableTypeCategory::uncategorized; + return mlir::acc::VariableTypeCategory::composite; + } + + assert(eleTy && "expect to be able to unwrap the element type"); + return categorizeElemType(type, eleTy, var); +} + +template mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory(mlir::Type type, + mlir::Value var) const; + +template mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory( + mlir::Type type, mlir::Value var) const; + +template mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory(mlir::Type type, + mlir::Value var) const; + +template mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory(mlir::Type type, + mlir::Value var) const; + +static mlir::acc::VariableTypeCategory +categorizePointee(mlir::Type pointer, + mlir::TypedValue varPtr, + mlir::Type varType) { + // FIR uses operations to compute interior pointers. + // So for example, an array element or composite field access to a float + // value would both be represented as !fir.ref. We do not want to treat + // such a reference as a scalar. Thus unwrap interior pointer calculations. + auto baseRef = getBaseRef(varPtr); + + if (auto mappableTy = + mlir::dyn_cast(baseRef.getType())) + return mappableTy.getTypeCategory(baseRef); + + // It must be a pointer-like type since it is not a MappableType. 
+ auto ptrLikeTy = mlir::cast(baseRef.getType()); + mlir::Type eleTy = ptrLikeTy.getElementType(); + return categorizeElemType(pointer, eleTy, varPtr); +} + template <> mlir::acc::VariableTypeCategory OpenACCPointerLikeModel::getPointeeTypeCategory( diff --git a/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp index 5f174ad4b40fe..869f9c2429aa0 100644 --- a/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp @@ -19,11 +19,14 @@ namespace fir::acc { void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { registry.addExtension(+[](mlir::MLIRContext *ctx, fir::FIROpsDialect *dialect) { - fir::SequenceType::attachInterface>( - *ctx); fir::BoxType::attachInterface>(*ctx); fir::ClassType::attachInterface>( *ctx); + fir::ReferenceType::attachInterface< + OpenACCMappableModel>(*ctx); + fir::PointerType::attachInterface>( + *ctx); + fir::HeapType::attachInterface>(*ctx); fir::ReferenceType::attachInterface< OpenACCPointerLikeModel>(*ctx); @@ -31,6 +34,7 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { OpenACCPointerLikeModel>(*ctx); fir::HeapType::attachInterface>( *ctx); + fir::LLVMPointerType::attachInterface< OpenACCPointerLikeModel>(*ctx); }); diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 28f6c8bf02813..31076f6eb328f 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -312,11 +312,24 @@ class DoConcurrentConversion bool isComposite) const { mlir::omp::WsloopOperands wsloopClauseOps; + auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion, + mlir::Region &ompRegion) { + if (!firRegion.empty()) { + rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin()); + auto firYield = + mlir::cast(ompRegion.back().getTerminator()); + 
rewriter.setInsertionPoint(firYield); + rewriter.create(firYield.getLoc(), + firYield.getOperands()); + rewriter.eraseOp(firYield); + } + }; + // For `local` (and `local_init`) opernads, emit corresponding `private` // clauses and attach these clauses to the workshare loop. - if (!loop.getLocalOperands().empty()) + if (!loop.getLocalVars().empty()) for (auto [op, sym, arg] : llvm::zip_equal( - loop.getLocalOperands(), + loop.getLocalVars(), loop.getLocalSymsAttr().getAsRange(), loop.getRegionLocalArgs())) { auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom< @@ -326,50 +339,65 @@ class DoConcurrentConversion TODO(localizer.getLoc(), "local_init conversion is not supported yet"); - auto oldIP = rewriter.saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointAfter(localizer); + auto privatizer = rewriter.create( localizer.getLoc(), sym.getLeafReference().str() + ".omp", localizer.getTypeAttr().getValue(), mlir::omp::DataSharingClauseType::Private); - if (!localizer.getInitRegion().empty()) { - rewriter.cloneRegionBefore(localizer.getInitRegion(), - privatizer.getInitRegion(), - privatizer.getInitRegion().begin()); - auto firYield = mlir::cast( - privatizer.getInitRegion().back().getTerminator()); - rewriter.setInsertionPoint(firYield); - rewriter.create(firYield.getLoc(), - firYield.getOperands()); - rewriter.eraseOp(firYield); - } - - if (!localizer.getDeallocRegion().empty()) { - rewriter.cloneRegionBefore(localizer.getDeallocRegion(), - privatizer.getDeallocRegion(), - privatizer.getDeallocRegion().begin()); - auto firYield = mlir::cast( - privatizer.getDeallocRegion().back().getTerminator()); - rewriter.setInsertionPoint(firYield); - rewriter.create(firYield.getLoc(), - firYield.getOperands()); - rewriter.eraseOp(firYield); - } - - rewriter.restoreInsertionPoint(oldIP); + cloneFIRRegionToOMP(localizer.getInitRegion(), + privatizer.getInitRegion()); + cloneFIRRegionToOMP(localizer.getDeallocRegion(), + 
privatizer.getDeallocRegion()); wsloopClauseOps.privateVars.push_back(op); wsloopClauseOps.privateSyms.push_back( mlir::SymbolRefAttr::get(privatizer)); } + if (!loop.getReduceVars().empty()) { + for (auto [op, byRef, sym, arg] : llvm::zip_equal( + loop.getReduceVars(), loop.getReduceByrefAttr().asArrayRef(), + loop.getReduceSymsAttr().getAsRange(), + loop.getRegionReduceArgs())) { + auto firReducer = + mlir::SymbolTable::lookupNearestSymbolFrom( + loop, sym); + + mlir::OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointAfter(firReducer); + + auto ompReducer = rewriter.create( + firReducer.getLoc(), sym.getLeafReference().str() + ".omp", + firReducer.getTypeAttr().getValue()); + + cloneFIRRegionToOMP(firReducer.getAllocRegion(), + ompReducer.getAllocRegion()); + cloneFIRRegionToOMP(firReducer.getInitializerRegion(), + ompReducer.getInitializerRegion()); + cloneFIRRegionToOMP(firReducer.getReductionRegion(), + ompReducer.getReductionRegion()); + cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(), + ompReducer.getAtomicReductionRegion()); + cloneFIRRegionToOMP(firReducer.getCleanupRegion(), + ompReducer.getCleanupRegion()); + + wsloopClauseOps.reductionVars.push_back(op); + wsloopClauseOps.reductionByref.push_back(byRef); + wsloopClauseOps.reductionSyms.push_back( + mlir::SymbolRefAttr::get(ompReducer)); + } + } + auto wsloopOp = rewriter.create(loop.getLoc(), wsloopClauseOps); wsloopOp.setComposite(isComposite); Fortran::common::openmp::EntryBlockArgs wsloopArgs; wsloopArgs.priv.vars = wsloopClauseOps.privateVars; + wsloopArgs.reduction.vars = wsloopClauseOps.reductionVars; Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs, wsloopOp.getRegion()); @@ -393,7 +421,8 @@ class DoConcurrentConversion clauseOps.loopLowerBounds.size()))) rewriter.replaceAllUsesWith(loopNestArg, wsloopArg); - for (unsigned i = 0; i < loop.getLocalVars().size(); ++i) + for (unsigned i = 0; + i < loop.getLocalVars().size() + loop.getReduceVars().size(); 
++i) loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size()); return loopNestOp; diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp index e440852b3103a..506c8e66dbdfa 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp @@ -234,6 +234,10 @@ class DoConcurrentConversion loop.setLocalSymsAttr(nullptr); } + for (auto [reduceVar, reduceArg] : + llvm::zip_equal(loop.getReduceVars(), loop.getRegionReduceArgs())) + rewriter.replaceAllUsesWith(reduceArg, reduceVar); + // Collect iteration variable(s) allocations so that we can move them // outside the `fir.do_concurrent` wrapper. llvm::SmallVector opsToMove; @@ -257,12 +261,16 @@ class DoConcurrentConversion innermostUnorderdLoop = rewriter.create( doConcurentOp.getLoc(), lb, ub, st, /*unordred=*/true, /*finalCountValue=*/false, - /*iterArgs=*/std::nullopt, loop.getReduceOperands(), + /*iterArgs=*/std::nullopt, loop.getReduceVars(), loop.getReduceAttrsAttr()); ivArgs.push_back(innermostUnorderdLoop.getInductionVar()); rewriter.setInsertionPointToStart(innermostUnorderdLoop.getBody()); } + loop.getRegion().front().eraseArguments(loop.getNumInductionVars() + + loop.getNumLocalOperands(), + loop.getNumReduceOperands()); + rewriter.inlineBlockBefore( &loopBlock, innermostUnorderdLoop.getBody()->getTerminator(), ivArgs); rewriter.eraseOp(doConcurentOp); diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 3016ce4ccd2f8..d70aaab82cbab 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1276,6 +1276,58 @@ struct OmpEndDirectiveParser { llvm::omp::Directive dir_; }; +struct OmpAllocatorsConstructParser { + using resultType = OpenMPAllocatorsConstruct; + + std::optional Parse(ParseState &state) const { + auto dirSpec{Parser{}.Parse(state)}; + if (!dirSpec || dirSpec->DirId() != 
llvm::omp::Directive::OMPD_allocators) { + return std::nullopt; + } + + // This should be an allocate-stmt. That will be checked in semantics. + Block block; + if (auto stmt{attempt(Parser{}).Parse(state)}) { + block.emplace_back(std::move(*stmt)); + } + // Allow empty block. Check for this in semantics. + + auto end{OmpEndDirectiveParser{llvm::omp::Directive::OMPD_allocators}}; + return OpenMPAllocatorsConstruct{ + std::move(*dirSpec), std::move(block), *maybe(end).Parse(state)}; + } +}; + +TYPE_PARSER(sourced( // + construct( + "ALLOCATORS"_tok >= OmpAllocatorsConstructParser{}))) + +struct OmpDispatchConstructParser { + using resultType = OpenMPDispatchConstruct; + + std::optional Parse(ParseState &state) const { + auto dirSpec{Parser{}.Parse(state)}; + if (!dirSpec || dirSpec->DirId() != llvm::omp::Directive::OMPD_dispatch) { + return std::nullopt; + } + + // This should be a function call. That will be checked in semantics. + Block block; + if (auto stmt{attempt(Parser{}).Parse(state)}) { + block.emplace_back(std::move(*stmt)); + } + // Allow empty block. Check for this in semantics. + + auto end{OmpEndDirectiveParser{llvm::omp::Directive::OMPD_dispatch}}; + return OpenMPDispatchConstruct{ + std::move(*dirSpec), std::move(block), *maybe(end).Parse(state)}; + } +}; + +TYPE_PARSER(sourced( // + construct( + "DISPATCH"_tok >= OmpDispatchConstructParser{}))) + // Parser for an arbitrary OpenMP ATOMIC construct. 
// // Depending on circumstances, an ATOMIC construct applies to one or more @@ -1605,16 +1657,6 @@ TYPE_PARSER(sourced(construct(verbatim("CRITICAL"_tok), TYPE_PARSER(construct( Parser{}, block, Parser{})) -TYPE_PARSER(sourced(construct( - verbatim("DISPATCH"_tok), Parser{}))) - -TYPE_PARSER( - construct(startOmpLine >> "END DISPATCH"_tok)) - -TYPE_PARSER(sourced(construct( - Parser{} / endOmpLine, block, - maybe(Parser{} / endOmpLine)))) - // 2.11.3 Executable Allocate directive TYPE_PARSER( sourced(construct(verbatim("ALLOCATE"_tok), @@ -1622,16 +1664,6 @@ TYPE_PARSER( maybe(nonemptyList(Parser{})) / endOmpLine, statement(allocateStmt)))) -// 6.7 Allocators construct [OpenMP 5.2] -// allocators-construct -> ALLOCATORS [allocate-clause [,]] -// allocate-stmt -// [omp-end-allocators-construct] -TYPE_PARSER(sourced(construct( - verbatim("ALLOCATORS"_tok), Parser{} / endOmpLine, - statement(allocateStmt), maybe(Parser{} / endOmpLine)))) - -TYPE_PARSER(construct(startOmpLine >> "END ALLOCATORS"_tok)) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct( verbatim("DECLARE SIMD"_tok) || verbatim("DECLARE_SIMD"_tok), diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index dcd1ac165adc1..b66d756bdbf2c 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2571,7 +2571,7 @@ class UnparseVisitor { Word(ToUpperCaseLetters(common::EnumToString(x))); } - void Unparse(const OpenMPAtomicConstruct &x) { + template void UnparseBlockConstruct(const Construct &x) { BeginOpenMP(); Word("!$OMP "); Walk(std::get(x.t)); @@ -2587,6 +2587,10 @@ class UnparseVisitor { } } + void Unparse(const OpenMPAtomicConstruct &x) { // + UnparseBlockConstruct(x); + } + void Unparse(const OpenMPExecutableAllocate &x) { const auto &fields = std::get>>( @@ -2614,22 +2618,8 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } - void Unparse(const OmpEndAllocators &x) { - BeginOpenMP(); - Word("!$OMP END ALLOCATE"); - Put("\n"); - EndOpenMP(); - } 
- void Unparse(const OpenMPAllocatorsConstruct &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get>(x.t)); - if (const auto &end = std::get>(x.t)) { - Walk(*end); - } + void Unparse(const OpenMPAllocatorsConstruct &x) { // + UnparseBlockConstruct(x); } void Unparse(const OmpAssumeDirective &x) { BeginOpenMP(); @@ -2768,6 +2758,9 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } + void Unparse(const OpenMPDispatchConstruct &x) { // + UnparseBlockConstruct(x); + } void Unparse(const OpenMPRequiresConstruct &y) { BeginOpenMP(); Word("!$OMP REQUIRES "); @@ -2787,15 +2780,6 @@ class UnparseVisitor { Walk(x.v); return false; } - void Unparse(const OmpDispatchDirective &x) { - Word("!$OMP DISPATCH"); - Walk(x.t); - Put("\n"); - } - void Unparse(const OmpEndDispatchDirective &) { - Word("!$OMP END DISPATCH"); - Put("\n"); - } void Unparse(const OmpErrorDirective &x) { Word("!$OMP ERROR "); Walk(x.t); diff --git a/flang/lib/Semantics/check-omp-atomic.cpp b/flang/lib/Semantics/check-omp-atomic.cpp index 047c604693460..c5ed8796f0c34 100644 --- a/flang/lib/Semantics/check-omp-atomic.cpp +++ b/flang/lib/Semantics/check-omp-atomic.cpp @@ -47,43 +47,12 @@ static bool operator!=(const evaluate::Expr &e, const evaluate::Expr &f) { return !(e == f); } -// There is no consistent way to get the source of a given ActionStmt, so -// extract the source information from Statement when we can, -// and keep it around for error reporting in further analyses. 
-struct SourcedActionStmt { - const parser::ActionStmt *stmt{nullptr}; - parser::CharBlock source; - - operator bool() const { return stmt != nullptr; } -}; - struct AnalyzedCondStmt { SomeExpr cond{evaluate::NullPointer{}}; // Default ctor is deleted parser::CharBlock source; SourcedActionStmt ift, iff; }; -static SourcedActionStmt GetActionStmt( - const parser::ExecutionPartConstruct *x) { - if (x == nullptr) { - return SourcedActionStmt{}; - } - if (auto *exec{std::get_if(&x->u)}) { - using ActionStmt = parser::Statement; - if (auto *stmt{std::get_if(&exec->u)}) { - return SourcedActionStmt{&stmt->statement, stmt->source}; - } - } - return SourcedActionStmt{}; -} - -static SourcedActionStmt GetActionStmt(const parser::Block &block) { - if (block.size() == 1) { - return GetActionStmt(&block.front()); - } - return SourcedActionStmt{}; -} - // Compute the `evaluate::Assignment` from parser::ActionStmt. The assumption // is that the ActionStmt will be either an assignment or a pointer-assignment, // otherwise return std::nullopt. 
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 89c1565bf66aa..2425265e196c6 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -495,6 +495,12 @@ template struct DirectiveSpellingVisitor { template bool Pre(const T &) { return true; } template void Post(const T &) {} + template + static const parser::OmpDirectiveName &GetDirName( + const std::tuple &t) { + return std::get(t).DirName(); + } + bool Pre(const parser::OmpSectionsDirective &x) { checker_(x.source, x.v); return false; @@ -503,8 +509,8 @@ template struct DirectiveSpellingVisitor { checker_(std::get(x.t).source, Directive::OMPD_allocate); return false; } - bool Pre(const parser::OmpDispatchDirective &x) { - checker_(std::get(x.t).source, Directive::OMPD_dispatch); + bool Pre(const parser::OpenMPDispatchConstruct &x) { + checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } bool Pre(const parser::OmpErrorDirective &x) { @@ -520,8 +526,7 @@ template struct DirectiveSpellingVisitor { return false; } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { - checker_( - std::get(x.t).source, Directive::OMPD_allocators); + checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; } bool Pre(const parser::OmpAssumeDirective &x) { @@ -1590,28 +1595,31 @@ void OmpStructureChecker::Enter(const parser::OmpErrorDirective &x) { } void OmpStructureChecker::Enter(const parser::OpenMPDispatchConstruct &x) { - PushContextAndClauseSets(x.source, llvm::omp::Directive::OMPD_dispatch); + auto &dirSpec{std::get(x.t)}; const auto &block{std::get(x.t)}; - if (block.empty() || block.size() > 1) { + PushContextAndClauseSets( + dirSpec.DirName().source, llvm::omp::Directive::OMPD_dispatch); + + if (block.empty()) { context_.Say(x.source, - "The DISPATCH construct is empty or contains more than one statement"_err_en_US); + "The DISPATCH construct should contain a single 
function or subroutine call"_err_en_US); return; } - auto it{block.begin()}; bool passChecks{false}; - if (const parser::AssignmentStmt * - assignStmt{parser::Unwrap(*it)}) { + omp::SourcedActionStmt action{omp::GetActionStmt(block)}; + if (const auto *assignStmt{ + parser::Unwrap(*action.stmt)}) { if (parser::Unwrap(assignStmt->t)) { passChecks = true; } - } else if (parser::Unwrap(*it)) { + } else if (parser::Unwrap(*action.stmt)) { passChecks = true; } if (!passChecks) { - context_.Say(x.source, - "The DISPATCH construct does not contain a SUBROUTINE or FUNCTION"_err_en_US); + context_.Say(action.source, + "The body of the DISPATCH construct should be a function or a subroutine call"_err_en_US); } } @@ -1657,26 +1665,45 @@ void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { isPredefinedAllocator = true; - const auto &dir{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocators); - const auto &clauseList{std::get(x.t)}; - for (const auto &clause : clauseList.v) { + + auto &dirSpec{std::get(x.t)}; + auto &block{std::get(x.t)}; + PushContextAndClauseSets( + dirSpec.DirName().source, llvm::omp::Directive::OMPD_allocators); + + if (block.empty()) { + context_.Say(dirSpec.source, + "The ALLOCATORS construct should contain a single ALLOCATE statement"_err_en_US); + return; + } + + omp::SourcedActionStmt action{omp::GetActionStmt(block)}; + const auto *allocate{ + action ? parser::Unwrap(action.stmt) : nullptr}; + + if (!allocate) { + const parser::CharBlock &source = action ? 
action.source : x.source; + context_.Say(source, + "The body of the ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); + } + + for (const auto &clause : dirSpec.Clauses().v) { if (const auto *allocClause{ parser::Unwrap(clause)}) { CheckVarIsNotPartOfAnotherVar( - dir.source, std::get(allocClause->v.t)); + dirSpec.source, std::get(allocClause->v.t)); } } } void OmpStructureChecker::Leave(const parser::OpenMPAllocatorsConstruct &x) { - const auto &dir{std::get(x.t)}; - const auto &clauseList{std::get(x.t)}; - for (const auto &clause : clauseList.v) { + auto &dirSpec{std::get(x.t)}; + + for (const auto &clause : dirSpec.Clauses().v) { if (const auto *allocClause{ std::get_if(&clause.u)}) { CheckPredefinedAllocatorRestriction( - dir.source, std::get(allocClause->v.t)); + dirSpec.source, std::get(allocClause->v.t)); } } dirContext_.pop_back(); diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp index fd9596a09cd52..f43d2cc75620e 100644 --- a/flang/lib/Semantics/openmp-utils.cpp +++ b/flang/lib/Semantics/openmp-utils.cpp @@ -38,6 +38,26 @@ namespace Fortran::semantics::omp { +SourcedActionStmt GetActionStmt(const parser::ExecutionPartConstruct *x) { + if (x == nullptr) { + return SourcedActionStmt{}; + } + if (auto *exec{std::get_if(&x->u)}) { + using ActionStmt = parser::Statement; + if (auto *stmt{std::get_if(&exec->u)}) { + return SourcedActionStmt{&stmt->statement, stmt->source}; + } + } + return SourcedActionStmt{}; +} + +SourcedActionStmt GetActionStmt(const parser::Block &block) { + if (block.size() == 1) { + return GetActionStmt(&block.front()); + } + return SourcedActionStmt{}; +} + std::string ThisVersion(unsigned version) { std::string tv{ std::to_string(version / 10) + "." 
+ std::to_string(version % 10)}; diff --git a/flang/lib/Semantics/openmp-utils.h b/flang/lib/Semantics/openmp-utils.h index dbb0565215357..a96c008fb26e7 100644 --- a/flang/lib/Semantics/openmp-utils.h +++ b/flang/lib/Semantics/openmp-utils.h @@ -29,6 +29,19 @@ class Symbol; // Add this namespace to avoid potential conflicts namespace omp { +// There is no consistent way to get the source of an ActionStmt, but there +// is "source" in Statement. This structure keeps the ActionStmt with the +// extracted source for further use. +struct SourcedActionStmt { + const parser::ActionStmt *stmt{nullptr}; + parser::CharBlock source; + + operator bool() const { return stmt != nullptr; } +}; + +SourcedActionStmt GetActionStmt(const parser::ExecutionPartConstruct *x); +SourcedActionStmt GetActionStmt(const parser::Block &block); + std::string ThisVersion(unsigned version); std::string TryVersion(unsigned version); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 299bb6ff876e7..151f4ccae634e 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -10,6 +10,7 @@ #include "check-acc-structure.h" #include "check-omp-structure.h" +#include "openmp-utils.h" #include "resolve-names-utils.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/fold.h" @@ -353,12 +354,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { return true; } - bool Pre(const parser::OmpDirectiveSpecification &x) { - PushContext(x.source, x.DirId()); - return true; - } - void Post(const parser::OmpDirectiveSpecification &) { PopContext(); } - bool Pre(const parser::OmpMetadirectiveDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_metadirective); return true; @@ -372,6 +367,29 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { GetContext().withinConstruct = true; } + bool Pre(const parser::OpenMPStandaloneConstruct &x) { + common::visit( + [&](auto &&s) { + using TypeS 
= llvm::remove_cvref_t; + // These two cases are handled individually. + if constexpr ( // + !std::is_same_v && + !std::is_same_v) { + PushContext(x.source, s.v.DirId()); + } + }, + x.u); + return true; + } + + void Post(const parser::OpenMPStandaloneConstruct &x) { + // These two cases are handled individually. + if (!std::holds_alternative(x.u) && + !std::holds_alternative(x.u)) { + PopContext(); + } + } + bool Pre(const parser::OpenMPSimpleStandaloneConstruct &); void Post(const parser::OpenMPSimpleStandaloneConstruct &) { PopContext(); } @@ -2149,9 +2167,10 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { } bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) { - PushContext(x.source, llvm::omp::Directive::OMPD_allocators); - const auto &clauseList{std::get(x.t)}; - for (const auto &clause : clauseList.v) { + auto &dirSpec{std::get(x.t)}; + PushContext(x.source, dirSpec.DirId()); + + for (const auto &clause : dirSpec.Clauses().v) { if (const auto *allocClause{ std::get_if(&clause.u)}) { ResolveOmpObjectList(std::get(allocClause->v.t), @@ -2234,28 +2253,43 @@ void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { } void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { - const auto &dir{std::get(x.t)}; - const auto &clauseList{std::get(x.t)}; - for (const auto &clause : clauseList.v) { - if (const auto *alloc{ - std::get_if(&clause.u)}) { - CheckAllNamesInAllocateStmt(dir.source, - std::get(alloc->v.t), - std::get>(x.t).statement); - - auto &modifiers{OmpGetModifiers(alloc->v)}; - bool hasAllocator{ - OmpGetUniqueModifier(modifiers) || - OmpGetUniqueModifier(modifiers)}; - - // TODO: As with allocate directive, exclude the case when a requires - // directive with the dynamic_allocators clause is present in - // the same compilation unit (OMP5.0 2.11.3). 
- if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && - !hasAllocator) { - context_.Say(x.source, - "ALLOCATORS directives that appear in a TARGET region " - "must specify an allocator"_err_en_US); + auto &dirSpec{std::get(x.t)}; + auto &block{std::get(x.t)}; + + omp::SourcedActionStmt action{omp::GetActionStmt(block)}; + const parser::AllocateStmt *allocate{[&]() { + if (action) { + if (auto *alloc{std::get_if>( + &action.stmt->u)}) { + return &alloc->value(); + } + } + return static_cast(nullptr); + }()}; + + if (allocate) { + for (const auto &clause : dirSpec.Clauses().v) { + if (auto *alloc{std::get_if(&clause.u)}) { + CheckAllNamesInAllocateStmt( + x.source, std::get(alloc->v.t), *allocate); + + using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; + using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; + + auto &modifiers{OmpGetModifiers(alloc->v)}; + bool hasAllocator{ + OmpGetUniqueModifier(modifiers) || + OmpGetUniqueModifier(modifiers)}; + + // TODO: As with allocate directive, exclude the case when a requires + // directive with the dynamic_allocators clause is present in + // the same compilation unit (OMP5.0 2.11.3). 
+ if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && + !hasAllocator) { + context_.Say(x.source, + "ALLOCATORS directives that appear in a TARGET region " + "must specify an allocator"_err_en_US); + } } } } diff --git a/flang/test/Fir/OpenACC/openacc-mappable.fir b/flang/test/Fir/OpenACC/openacc-mappable.fir index 3e3e455469f69..71576f4b71075 100644 --- a/flang/test/Fir/OpenACC/openacc-mappable.fir +++ b/flang/test/Fir/OpenACC/openacc-mappable.fir @@ -23,7 +23,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, // CHECK: Size: 40 // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr", structured = false} - // CHECK: Mappable: !fir.array<10xf32> + // CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array // CHECK: Size: 40 @@ -60,20 +60,17 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, } // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr1", structured = false} - // CHECK: Pointer-like: !fir.ref> - // CHECK: Mappable: !fir.array + // CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c1{{.*}} : index) // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr2", structured = false} - // CHECK: Pointer-like: !fir.ref> - // CHECK: Mappable: !fir.array + // CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array // CHECK: Bound[0]: %{{.*}} = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}} : index) stride(%c1{{.*}} : index) startIdx(%c2{{.*}} : index) // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr3", structured = false} - // CHECK: Pointer-like: !fir.ref> - // CHECK: Mappable: !fir.array<10xf32> + // 
CHECK: Pointer-like and Mappable: !fir.ref> // CHECK: Type category: array // CHECK: Size: 40 // CHECK: Offset: 0 diff --git a/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 b/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 index 58025bfa556a5..e8951cceeeaeb 100644 --- a/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 +++ b/flang/test/Fir/OpenACC/openacc-type-categories-class.f90 @@ -29,13 +29,13 @@ subroutine init_unlimited(this) ! CHECK: Mappable: !fir.class> ! CHECK: Type category: composite ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "this%field", structured = false} -! CHECK: Pointer-like: !fir.ref +! CHECK: Pointer-like and Mappable: !fir.ref ! CHECK: Type category: composite ! For unlimited polymorphic entities and assumed types - they effectively have ! no declared type. Thus the type categorizer cannot categorize it. ! CHECK: Visiting: {{.*}} = acc.copyin {{.*}} {name = "var", structured = false} -! CHECK: Pointer-like: !fir.ref +! CHECK: Pointer-like and Mappable: !fir.ref ! CHECK: Type category: uncategorized ! CHECK: Visiting: {{.*}} = acc.copyin {{.*}} {name = "this", structured = false} ! CHECK: Mappable: !fir.class diff --git a/flang/test/Fir/OpenACC/openacc-type-categories.f90 b/flang/test/Fir/OpenACC/openacc-type-categories.f90 index c25c38422b755..3d6067db8224d 100644 --- a/flang/test/Fir/OpenACC/openacc-type-categories.f90 +++ b/flang/test/Fir/OpenACC/openacc-type-categories.f90 @@ -18,32 +18,32 @@ program main end program ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "scalar", structured = false} -! CHECK: Pointer-like: !fir.ref +! CHECK: Pointer-like and Mappable: !fir.ref ! CHECK: Type category: scalar ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "scalaralloc", structured = false} -! CHECK: Pointer-like: !fir.ref>> +! CHECK: Pointer-like and Mappable: !fir.ref>> ! CHECK: Type category: nonscalar ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "ttvar", structured = false} -! 
CHECK: Pointer-like: !fir.ref}>> +! CHECK: Pointer-like and Mappable: !fir.ref}>> ! CHECK: Type category: composite ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "arrayconstsize", structured = false} -! CHECK: Pointer-like: !fir.ref> +! CHECK: Pointer-like and Mappable: !fir.ref> ! CHECK: Type category: array ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "arrayalloc", structured = false} -! CHECK: Pointer-like: !fir.ref>>> +! CHECK: Pointer-like and Mappable: !fir.ref>>> ! CHECK: Type category: array ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "complexvar", structured = false} -! CHECK: Pointer-like: !fir.ref> +! CHECK: Pointer-like and Mappable: !fir.ref> ! CHECK: Type category: scalar ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "charvar", structured = false} -! CHECK: Pointer-like: !fir.ref> +! CHECK: Pointer-like and Mappable: !fir.ref> ! CHECK: Type category: nonscalar ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "ttvar%field", structured = false} -! CHECK: Pointer-like: !fir.ref +! CHECK: Pointer-like and Mappable: !fir.ref ! CHECK: Type category: composite ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "ttvar%fieldarray", structured = false} -! CHECK: Pointer-like: !fir.ref> +! CHECK: Pointer-like and Mappable: !fir.ref> ! CHECK: Type category: array ! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "arrayconstsize(1)", structured = false} -! CHECK: Pointer-like: !fir.ref> +! CHECK: Pointer-like and Mappable: !fir.ref> ! 
CHECK: Type category: array diff --git a/flang/test/Fir/do_concurrent.fir b/flang/test/Fir/do_concurrent.fir index cc1197ba56bd7..6e2173447855e 100644 --- a/flang/test/Fir/do_concurrent.fir +++ b/flang/test/Fir/do_concurrent.fir @@ -63,7 +63,7 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index, %j = fir.alloca i32 fir.do_concurrent.loop (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) - reduce(#fir.reduce_attr -> %sum : !fir.ref) { + reduce(@add_reduction_i32 #fir.reduce_attr %sum -> %sum_arg : !fir.ref) { %0 = fir.convert %i_iv : (index) -> i32 fir.store %0 to %i : !fir.ref @@ -83,7 +83,7 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index, // CHECK: %[[I:.*]] = fir.alloca i32 // CHECK: %[[J:.*]] = fir.alloca i32 // CHECK: fir.do_concurrent.loop -// CHECK-SAME: (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(#fir.reduce_attr -> %[[SUM]] : !fir.ref) { +// CHECK-SAME: (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(@add_reduction_i32 #fir.reduce_attr %[[SUM]] -> %{{.*}} : !fir.ref) { // CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32 // CHECK: fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref // CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32 @@ -161,3 +161,62 @@ func.func @do_concurrent_with_locality_specs() { // CHECK: } // CHECK: return // CHECK: } + +func.func @dc_reduce() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} + %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + fir.do_concurrent { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(byref @add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1 : !fir.ref) { + } + } + return +} + +// CHECK-LABEL: func.func @dc_reduce() { +// CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", 
uniq_name = "dc_reduce"} +// CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: fir.do_concurrent { +// CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) reduce(byref @add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:.*]] : !fir.ref) { +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @dc_reduce_2() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} + %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + %5 = fir.alloca i32 {bindc_name = "m", uniq_name = "dc_reduce"} + %6:2 = hlfir.declare %5 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + %c1 = arith.constant 1 : index + + fir.do_concurrent { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) + reduce(@add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1, + @mul_reduction_i32 #fir.reduce_attr %6#0 -> %arg2 + : !fir.ref, !fir.ref) { + } + } + + return +} + +// CHECK-LABEL: func.func @dc_reduce_2() { +// CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} +// CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +// CHECK: %[[M_ALLOC:.*]] = fir.alloca i32 {bindc_name = "m", uniq_name = "dc_reduce"} +// CHECK: %[[M_DECL:.*]]:2 = hlfir.declare %[[M_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: fir.do_concurrent { +// CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{[^[:space:]]+}}) +// CHECK-SAME: reduce( +// CHECK-SAME: @add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:[^,]+]], +// CHECK-SAME: @mul_reduction_i32 #fir.reduce_attr %[[M_DECL]]#0 -> %[[M_ARG:[^[:space:]]+]] +// CHECK-SAME: : !fir.ref, !fir.ref) { +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir 
index aca0ecc1abdc1..e32ea7ad3c729 100644 --- a/flang/test/Fir/invalid.fir +++ b/flang/test/Fir/invalid.fir @@ -1256,8 +1256,8 @@ func.func @dc_invalid_reduction(%arg0: index, %arg1: index) { %sum = fir.alloca i32 // expected-error@+2 {{'fir.do_concurrent.loop' op mismatch in number of reduction variables and reduction attributes}} fir.do_concurrent { - "fir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array}> ({ - ^bb0(%arg3: index): + "fir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array}> ({ + ^bb0(%arg3: index, %sum_arg: i32): %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array}> : () -> !fir.ref }) : (index, index, index, !fir.ref) -> () } @@ -1266,6 +1266,20 @@ func.func @dc_invalid_reduction(%arg0: index, %arg1: index) { // ----- +func.func @dc_reduce_no_attr() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} + %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + // expected-error@+2 {{expected attribute value}} + fir.do_concurrent { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(@add_reduction_i32 %4#0 -> %arg1 : !fir.ref) { + } + } + return +} + +// ----- + // Should fail when volatility changes from a fir.convert func.func @bad_convert_volatile(%arg0: !fir.ref) -> !fir.ref { // expected-error@+1 {{op this conversion does not preserve volatility}} diff --git a/flang/test/HLFIR/fir-reduction-alloca-block.fir b/flang/test/HLFIR/fir-reduction-alloca-block.fir new file mode 100644 index 0000000000000..75857cfbe01d3 --- /dev/null +++ b/flang/test/HLFIR/fir-reduction-alloca-block.fir @@ -0,0 +1,31 @@ +// Tests that `fir.local` ops are able to provide an alloca block when required. 
+ +// RUN: fir-opt %s -convert-hlfir-to-fir | FileCheck %s + +fir.declare_reduction @add_reduction_byref_box_heap_UxUxf32 : !fir.ref>>> alloc { + %0 = fir.alloca !fir.box>> + fir.yield(%0 : !fir.ref>>>) +} init { +^bb0(%arg0: !fir.ref>>>, %arg1: !fir.ref>>>): + %cst = arith.constant 0.000000e+00 : f32 + %0 = fir.load %arg1 : !fir.ref>>> + hlfir.assign %cst to %0 : f32, !fir.box>> + fir.yield(%arg1 : !fir.ref>>>) +} combiner { +^bb0(%arg0: !fir.ref>>>, %arg1: !fir.ref>>>): + fir.yield(%arg0 : !fir.ref>>>) +} + +// CHECK-LABEL: fir.declare_reduction @add_reduction_byref_box_heap_UxUxf32 : !fir.ref>>> alloc { +// CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box>> +// CHECK: fir.yield(%[[VAL_0]] : !fir.ref>>>) + +// CHECK-LABEL: } init { +// CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[VAL_1:.*]]: !fir.ref>>>): +// CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box>> +// CHECK: fir.yield(%[[VAL_1]] : !fir.ref>>>) + +// CHECK-LABEL: } combiner { +// CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[VAL_1:.*]]: !fir.ref>>>): +// CHECK: fir.yield(%[[VAL_0]] : !fir.ref>>>) +// CHECK: } diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 3a9b55996d9b1..3a4aff977b7a5 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -436,3 +436,11 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub22() ! CHECK: cuf.data_transfer + +subroutine sub23(n) + integer :: n + real(8), device :: d(n,n), x(n) + x = sum(d,dim=2) ! Was triggering Unsupported CUDA data transfer +end subroutine + +! CHECK-LABEL: func.func @_QPsub23 diff --git a/flang/test/Lower/do_concurrent_reduce.f90 b/flang/test/Lower/do_concurrent_reduce.f90 new file mode 100644 index 0000000000000..8591a21e2b9e0 --- /dev/null +++ b/flang/test/Lower/do_concurrent_reduce.f90 @@ -0,0 +1,41 @@ +! 
RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s + +subroutine do_concurrent_reduce + implicit none + integer :: s, i + + do concurrent (i=1:10) reduce(+:s) + s = s + 1 + end do +end + +! CHECK-LABEL: fir.declare_reduction @add_reduction_i32 : i32 init { +! CHECK: ^bb0(%[[ARG0:.*]]: i32): +! CHECK: %[[VAL_0:.*]] = arith.constant 0 : i32 +! CHECK: fir.yield(%[[VAL_0]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i32): +! CHECK: %[[VAL_3:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : i32 +! CHECK: fir.yield(%[[VAL_3]] : i32) +! CHECK: } + +! CHECK-LABEL: func.func @_QPdo_concurrent_reduce() { +! CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} +! CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +! CHECK: fir.do_concurrent { +! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{[^[:space:]]+}}) +! CHECK-SAME: reduce(@add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:.*]] : !fir.ref) { + +! CHECK: %[[S_ARG_DECL:.*]]:2 = hlfir.declare %[[S_ARG]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[S_ARG_VAL:.*]] = fir.load %[[S_ARG_DECL]]#0 : !fir.ref +! CHECK: %[[C1:.*]] = arith.constant 1 : i32 +! CHECK: %[[RED_UPDATE:.*]] = arith.addi %[[S_ARG_VAL]], %[[C1]] : i32 +! CHECK: hlfir.assign %[[RED_UPDATE]] to %[[S_ARG_DECL]]#0 : i32, !fir.ref + +! CHECK: } +! CHECK: } +! CHECK: return +! 
CHECK: } diff --git a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 new file mode 100644 index 0000000000000..873fd10dd1b97 --- /dev/null +++ b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s + +subroutine do_concurrent_allocatable + integer :: i + real, allocatable, dimension(:,:) :: x + + do concurrent (i = 1:10) reduce(+: x) + end do +end subroutine + +! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] alloc { +! CHECK: %[[ALLOC:.*]] = fir.alloca +! CHECK: fir.yield(%[[ALLOC]] : ![[RED_TYPE]]) +! CHECK: } init { +! CHECK: ^bb0(%{{.*}}: ![[RED_TYPE]], %[[RED_ARG:.*]]: ![[RED_TYPE]]): +! CHECK: fir.yield(%[[RED_ARG]] : !{{.*}}) +! CHECK: } combiner { +! CHECK: ^bb0(%[[COMB_RES:.*]]: ![[RED_TYPE]], %{{.*}}: ![[RED_TYPE]]): +! CHECK: fir.yield(%[[COMB_RES]] : !{{.*}}) +! CHECK: } cleanup { +! CHECK: fir.yield +! CHECK: } diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 60df27a591dc3..64f14ff972272 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s +! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s ! CHECK-LABEL: loop_test subroutine loop_test diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 84db1972cca16..2965b954b49a8 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -12,7 +12,7 @@ subroutine loop_test ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"} ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref - ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! 
CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(@add_reduction_i32 #fir.reduce_attr %[[VAL_1]] -> %{{.*}}, @other_reduction_f32 #fir.reduce_attr %[[VAL_0]] -> %{{.*}} : {{.*}}) { do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m) tmp = i + j + k sum = tmp + sum diff --git a/flang/test/Parser/OpenMP/allocators-unparse.f90 b/flang/test/Parser/OpenMP/allocators-unparse.f90 index 5cd0230471fc4..70feb7a6b527e 100644 --- a/flang/test/Parser/OpenMP/allocators-unparse.f90 +++ b/flang/test/Parser/OpenMP/allocators-unparse.f90 @@ -15,48 +15,62 @@ subroutine allocate() !$omp allocators allocate(align(32): arr2) allocate(arr2(5, 3)) + !$omp end allocators end subroutine allocate !CHECK: INTEGER, ALLOCATABLE :: arr1(:), arr2(:,:) -!CHECK-NEXT:!$OMP ALLOCATE ALLOCATE(omp_default_mem_alloc: arr1) +!CHECK-NEXT:!$OMP ALLOCATORS ALLOCATE(omp_default_mem_alloc: arr1) !CHECK-NEXT: ALLOCATE(arr1(5)) -!CHECK-NEXT:!$OMP ALLOCATE ALLOCATE(ALLOCATOR(omp_default_mem_alloc), ALIGN(32): arr1) ALL& -!CHECK-NEXT:!$OMP&OCATE(omp_default_mem_alloc: arr2) +!CHECK-NEXT:!$OMP ALLOCATORS ALLOCATE(ALLOCATOR(omp_default_mem_alloc), ALIGN(32): arr1) A& +!CHECK-NEXT:!$OMP&LLOCATE(omp_default_mem_alloc: arr2) !CHECK-NEXT: ALLOCATE(arr1(10), arr2(3,2)) -!CHECK-NEXT:!$OMP ALLOCATE ALLOCATE(ALIGN(32): arr2) +!CHECK-NEXT:!$OMP ALLOCATORS ALLOCATE(ALIGN(32): arr2) !CHECK-NEXT: ALLOCATE(arr2(5,3)) !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAllocatorsConstruct -!PARSE-TREE-NEXT: Verbatim -!PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Allocate -> OmpAllocateClause -!PARSE-TREE-NEXT: Modifier -> OmpAllocatorSimpleModifier -> Scalar -> Integer -> Expr -> Designator -> DataRef -> Name = 'omp_default_mem_alloc' -!PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr1' -!PARSE-TREE-NEXT: AllocateStmt -!PARSE-TREE-NEXT: Allocation -!PARSE-TREE-NEXT: AllocateObject -> Name = 'arr1' 
+!PARSE-TREE-NEXT: | OmpDirectiveSpecification +!PARSE-TREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocators +!PARSE-TREE-NEXT: | | OmpClauseList -> OmpClause -> Allocate -> OmpAllocateClause +!PARSE-TREE-NEXT: | | | Modifier -> OmpAllocatorSimpleModifier -> Scalar -> Integer -> Expr -> Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!PARSE-TREE-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr1' +!PARSE-TREE-NEXT: | | Flags = None +!PARSE-TREE-NEXT: | Block +!PARSE-TREE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt +!PARSE-TREE-NEXT: | | | Allocation +!PARSE-TREE-NEXT: | | | | AllocateObject -> Name = 'arr1' !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAllocatorsConstruct -!PARSE-TREE-NEXT: Verbatim -!PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Allocate -> OmpAllocateClause -!PARSE-TREE-NEXT: Modifier -> OmpAllocatorComplexModifier -> Scalar -> Integer -> Expr -> Designator -> DataRef -> Name = 'omp_default_mem_alloc' -!PARSE-TREE-NEXT: Modifier -> OmpAlignModifier -> Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '32' -!PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr1' -!PARSE-TREE-NEXT: OmpClause -> Allocate -> OmpAllocateClause -!PARSE-TREE-NEXT: Modifier -> OmpAllocatorSimpleModifier -> Scalar -> Integer -> Expr -> Designator -> DataRef -> Name = 'omp_default_mem_alloc' -!PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr2' -!PARSE-TREE-NEXT: AllocateStmt -!PARSE-TREE-NEXT: Allocation -!PARSE-TREE-NEXT: AllocateObject -> Name = 'arr1' -!PARSE-TREE-NEXT: AllocateShapeSpec -!PARSE-TREE-NEXT: Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '10' -!PARSE-TREE-NEXT: Allocation -!PARSE-TREE-NEXT: AllocateObject -> Name = 'arr2' +!PARSE-TREE-NEXT: | OmpDirectiveSpecification +!PARSE-TREE-NEXT: | | OmpDirectiveName -> 
llvm::omp::Directive = allocators +!PARSE-TREE-NEXT: | | OmpClauseList -> OmpClause -> Allocate -> OmpAllocateClause +!PARSE-TREE-NEXT: | | | Modifier -> OmpAllocatorComplexModifier -> Scalar -> Integer -> Expr -> Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!PARSE-TREE-NEXT: | | | Modifier -> OmpAlignModifier -> Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '32' +!PARSE-TREE-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr1' +!PARSE-TREE-NEXT: | | OmpClause -> Allocate -> OmpAllocateClause +!PARSE-TREE-NEXT: | | | Modifier -> OmpAllocatorSimpleModifier -> Scalar -> Integer -> Expr -> Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!PARSE-TREE-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr2' +!PARSE-TREE-NEXT: | | Flags = None +!PARSE-TREE-NEXT: | Block +!PARSE-TREE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt +!PARSE-TREE-NEXT: | | | Allocation +!PARSE-TREE-NEXT: | | | | AllocateObject -> Name = 'arr1' !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAllocatorsConstruct -!PARSE-TREE-NEXT: Verbatim -!PARSE-TREE-NEXT: OmpClauseList -> OmpClause -> Allocate -> OmpAllocateClause -!PARSE-TREE-NEXT: Modifier -> OmpAlignModifier -> Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '32' -!PARSE-TREE-NEXT: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'arr2' -!PARSE-TREE-NEXT: AllocateStmt -!PARSE-TREE-NEXT: Allocation -!PARSE-TREE-NEXT: AllocateObject -> Name = 'arr2' +!PARSE-TREE-NEXT: | OmpDirectiveSpecification +!PARSE-TREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocators +!PARSE-TREE-NEXT: | | OmpClauseList -> OmpClause -> Allocate -> OmpAllocateClause +!PARSE-TREE-NEXT: | | | Modifier -> OmpAlignModifier -> Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '32' +!PARSE-TREE-NEXT: | | | OmpObjectList -> OmpObject -> 
Designator -> DataRef -> Name = 'arr2' +!PARSE-TREE-NEXT: | | Flags = None +!PARSE-TREE-NEXT: | Block +!PARSE-TREE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt +!PARSE-TREE-NEXT: | | | Allocation +!PARSE-TREE-NEXT: | | | | AllocateObject -> Name = 'arr2' +!PARSE-TREE-NEXT: | | | | AllocateShapeSpec +!PARSE-TREE-NEXT: | | | | | Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '5' +!PARSE-TREE-NEXT: | | | | AllocateShapeSpec +!PARSE-TREE-NEXT: | | | | | Scalar -> Integer -> Expr -> LiteralConstant -> IntLiteralConstant = '3' +!PARSE-TREE-NEXT: | OmpDirectiveSpecification +!PARSE-TREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocators +!PARSE-TREE-NEXT: | | OmpClauseList -> +!PARSE-TREE-NEXT: | | Flags = None diff --git a/flang/test/Parser/OpenMP/dispatch.f90 b/flang/test/Parser/OpenMP/dispatch.f90 index 98cd6090334f3..4076c00331225 100644 --- a/flang/test/Parser/OpenMP/dispatch.f90 +++ b/flang/test/Parser/OpenMP/dispatch.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -fopenmp -fdebug-dump-parse-tree %s | FileCheck %s +! RUN: %flang_fc1 -fopenmp -fdebug-dump-parse-tree %s | FileCheck %s --check-prefix=PARSE-TREE ! 
RUN: %flang_fc1 -fopenmp -fdebug-unparse %s | FileCheck %s --check-prefix="UNPARSE" integer function func(a, b, c) @@ -12,40 +12,57 @@ subroutine sub(x) integer :: r type(c_ptr) :: x integer :: a = 14, b = 7, c = 21 + !UNPARSE: !$OMP DISPATCH DEVICE(3_4) NOWAIT NOCONTEXT(.false._4) NOVARIANTS(.true._4) -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct -!CHECK-NEXT: | | | OmpDispatchDirective -!CHECK: | | | | OmpClauseList -> OmpClause -> Device -> OmpDeviceClause -!CHECK-NEXT: | | | | | Scalar -> Integer -> Expr = '3_4' -!CHECK-NEXT: | | | | | | LiteralConstant -> IntLiteralConstant = '3' -!CHECK-NEXT: | | | | OmpClause -> Nowait -!CHECK-NEXT: | | | | OmpClause -> Nocontext -> Scalar -> Logical -> Expr = '.false._4' -!CHECK-NEXT: | | | | | LiteralConstant -> LogicalLiteralConstant -!CHECK-NEXT: | | | | | | bool = 'false' -!CHECK-NEXT: | | | | OmpClause -> Novariants -> Scalar -> Logical -> Expr = '.true._4' -!CHECK-NEXT: | | | | | EQ -!CHECK-NEXT: | | | | | | Expr = '1_4' -!CHECK-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-NEXT: | | | | | | Expr = '1_4' -!CHECK-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-NEXT: | | | Block - +!UNPARSE: r=func(a,b,c) +!UNPARSE: !$OMP END DISPATCH + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = dispatch +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Device -> OmpDeviceClause +!PARSE-TREE: | | | Scalar -> Integer -> Expr = '3_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '3' +!PARSE-TREE: | | OmpClause -> Nowait +!PARSE-TREE: | | OmpClause -> Nocontext -> Scalar -> Logical -> Expr = '.false._4' +!PARSE-TREE: | | | LiteralConstant -> LogicalLiteralConstant +!PARSE-TREE: | | | | bool = 'false' +!PARSE-TREE: | | OmpClause -> Novariants -> Scalar -> 
Logical -> Expr = '.true._4' +!PARSE-TREE: | | | EQ +!PARSE-TREE: | | | | Expr = '1_4' +!PARSE-TREE: | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | | | | Expr = '1_4' +!PARSE-TREE: | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt +![...] +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = dispatch +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None + !$omp dispatch device(3) nowait nocontext(.false.) novariants(1.eq.1) r = func(a, b, c) -!UNPARSE: !$OMP END DISPATCH -!CHECK: | | | OmpEndDispatchDirective !$omp end dispatch !! Test the "no end dispatch" option. -!UNPARSE: !$OMP DISPATCH DEVICE(3_4) IS_DEVICE_PTR(x) -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct -!CHECK-NEXT: | | | OmpDispatchDirective -!CHECK: | | | | OmpClause -> IsDevicePtr -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!UNPARSE: !$OMP DISPATCH DEVICE(3_4) IS_DEVICE_PTR(x) +!UNPARSE: r=func(a+1_4,b+2_4,c+3_4) + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = dispatch +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Device -> OmpDeviceClause +!PARSE-TREE: | | | Scalar -> Integer -> Expr = '3_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '3' +!PARSE-TREE: | | OmpClause -> IsDevicePtr -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt +!PARSE-TREE-NOT: OmpDirectiveSpecification + !$omp dispatch device(3) is_device_ptr(x) r = func(a+1, b+2, c+3) 
-!CHECK-NOT: | | | OmpEndDispatchDirective end subroutine sub - - - diff --git a/flang/test/Semantics/OpenMP/allocators07.f90 b/flang/test/Semantics/OpenMP/allocators07.f90 new file mode 100644 index 0000000000000..a28f706965cb1 --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocators07.f90 @@ -0,0 +1,27 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +subroutine f00 + implicit none + integer, allocatable :: a(:) + + !$omp allocators allocate(a) +!ERROR: The body of the ALLOCATORS construct should be an ALLOCATE statement + continue +end + +subroutine f01 + implicit none + integer, allocatable :: a(:) + +!ERROR: The ALLOCATORS construct should contain a single ALLOCATE statement + !$omp allocators allocate(a) + !$omp end allocators +end + +subroutine f02 + implicit none + integer, allocatable :: a(:) + +!ERROR: The ALLOCATORS construct should contain a single ALLOCATE statement + !$omp allocators allocate(a) +end diff --git a/flang/test/Semantics/OpenMP/dispatch.f90 b/flang/test/Semantics/OpenMP/dispatch.f90 index 7dfbeecb2fc1d..af0d6856ab948 100644 --- a/flang/test/Semantics/OpenMP/dispatch.f90 +++ b/flang/test/Semantics/OpenMP/dispatch.f90 @@ -1,24 +1,20 @@ -! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! 
RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 subroutine sb1 integer :: r r = 1 - !ERROR: The DISPATCH construct does not contain a SUBROUTINE or FUNCTION !$omp dispatch nowait +!ERROR: The body of the DISPATCH construct should be a function or a subroutine call print *,r end subroutine + subroutine sb2 - integer :: r -!ERROR: The DISPATCH construct is empty or contains more than one statement +!ERROR: The DISPATCH construct should contain a single function or subroutine call !$omp dispatch - call foo() - r = bar() !$omp end dispatch -contains - subroutine foo - end subroutine foo - function bar - integer :: bar - bar = 2 - end function +end subroutine + +subroutine sb3 +!ERROR: The DISPATCH construct should contain a single function or subroutine call + !$omp dispatch end subroutine diff --git a/flang/test/Transforms/DoConcurrent/reduce_add.mlir b/flang/test/Transforms/DoConcurrent/reduce_add.mlir new file mode 100644 index 0000000000000..1ea3e3e527335 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/reduce_add.mlir @@ -0,0 +1,73 @@ +// Tests mapping reductions from fir to OpenMP. 
+ +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +fir.declare_reduction @add_reduction_i32 : i32 init { +^bb0(%arg0: i32): + %c0_i32 = arith.constant 0 : i32 + fir.yield(%c0_i32 : i32) +} combiner { +^bb0(%arg0: i32, %arg1: i32): + %0 = arith.addi %arg0, %arg1 : i32 + fir.yield(%0 : i32) +} + +func.func @_QPdo_concurrent_reduce() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + fir.do_concurrent { + %7 = fir.alloca i32 {bindc_name = "i"} + %8:2 = hlfir.declare %7 {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(@add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1 : !fir.ref) { + %9 = fir.convert %arg0 : (index) -> i32 + fir.store %9 to %8#0 : !fir.ref + %10:2 = hlfir.declare %arg1 {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %11 = fir.load %10#0 : !fir.ref + %c1_i32_0 = arith.constant 1 : i32 + %12 = arith.addi %11, %c1_i32_0 : i32 + hlfir.assign %12 to %10#0 : i32, !fir.ref + } + } + return +} + +// CHECK-LABEL: omp.declare_reduction @add_reduction_i32.omp : i32 init { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32): +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 +// CHECK: omp.yield(%[[VAL_1]] : i32) + +// CHECK-LABEL: } combiner { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32): +// CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32 +// CHECK: omp.yield(%[[VAL_2]] : i32) +// CHECK: } + +// CHECK-LABEL: func.func @_QPdo_concurrent_reduce() { +// CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i"} +// CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = 
"_QFdo_concurrent_reduceEs"} +// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: omp.parallel { +// CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i"} +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +// CHECK: omp.wsloop reduction(@add_reduction_i32.omp %[[VAL_3]]#0 -> %[[VAL_7:.*]] : !fir.ref) { +// CHECK: omp.loop_nest (%[[VAL_8:.*]]) : index = (%[[VAL_4]]) to (%[[VAL_4]]) inclusive step (%[[VAL_4]]) { +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (index) -> i32 +// CHECK: fir.store %[[VAL_9]] to %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32 +// CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_10]]#0 : i32, !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } +// CHECK: omp.terminator +// CHECK: } + +// CHECK: return +// CHECK: } + diff --git a/flang/test/Transforms/DoConcurrent/reduce_all_regions.mlir b/flang/test/Transforms/DoConcurrent/reduce_all_regions.mlir new file mode 100644 index 0000000000000..3d5b8bf22af75 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/reduce_all_regions.mlir @@ -0,0 +1,70 @@ +// Tests mapping reductions from fir to OpenMP (all regions). 
+ +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +fir.declare_reduction @add_reduction_i32 : i32 init { +^bb0(%arg0: i32): + fir.yield(%arg0 : i32) +} combiner { +^bb0(%arg0: i32, %arg1: i32): + fir.yield(%arg0 : i32) +} atomic { +^bb0(%arg0: !fir.ref, %arg1: !fir.ref): + fir.yield(%arg0 : !fir.ref) +} cleanup { +^bb0(%arg0: i32): + fir.yield +} + +func.func @_QPdo_concurrent_reduce() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + fir.do_concurrent { + %7 = fir.alloca i32 {bindc_name = "i"} + %8:2 = hlfir.declare %7 {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(@add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1 : !fir.ref) { + %9 = fir.convert %arg0 : (index) -> i32 + fir.store %9 to %8#0 : !fir.ref + } + } + return +} + +// CHECK-LABEL: omp.declare_reduction @add_reduction_i32.omp : i32 init { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32): +// CHECK: omp.yield(%[[VAL_0]] : i32) + +// CHECK-LABEL: } combiner { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32): +// CHECK: omp.yield(%[[VAL_0]] : i32) + +// CHECK-LABEL: } atomic { +// CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[VAL_1:.*]]: !fir.ref): +// CHECK: omp.yield(%[[VAL_0]] : !fir.ref) + +// CHECK-LABEL: } cleanup { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32): +// CHECK: omp.yield +// CHECK: } + +// CHECK-LABEL: func.func @_QPdo_concurrent_reduce() { +// CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i"} +// CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} +// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = 
"_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: omp.parallel { +// CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i"} +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: omp.wsloop reduction(@add_reduction_i32.omp %[[VAL_3]]#0 -> %[[VAL_7:.*]] : !fir.ref) { +// CHECK: omp.loop_nest (%[[VAL_8:.*]]) : index = (%[[VAL_4]]) to (%[[VAL_4]]) inclusive step (%[[VAL_4]]) { +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (index) -> i32 +// CHECK: fir.store %[[VAL_9]] to %[[VAL_6]]#0 : !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } diff --git a/flang/test/Transforms/DoConcurrent/reduce_local.mlir b/flang/test/Transforms/DoConcurrent/reduce_local.mlir new file mode 100644 index 0000000000000..0f667109e6e83 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/reduce_local.mlir @@ -0,0 +1,83 @@ +// Tests mapping reductions and local from fir to OpenMP. 
+ +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +fir.declare_reduction @add_reduction_i32 : i32 init { +^bb0(%arg0: i32): + %c0_i32 = arith.constant 0 : i32 + fir.yield(%c0_i32 : i32) +} combiner { +^bb0(%arg0: i32, %arg1: i32): + %0 = arith.addi %arg0, %arg1 : i32 + fir.yield(%0 : i32) +} + fir.local {type = local} @_QFdo_concurrent_reduceEl_private_i32 : i32 + func.func @_QPdo_concurrent_reduce() { + %3 = fir.alloca i32 {bindc_name = "l", uniq_name = "_QFdo_concurrent_reduceEl"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFdo_concurrent_reduceEl"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %5 = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} + %6:2 = hlfir.declare %5 {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + fir.do_concurrent { + %9 = fir.alloca i32 {bindc_name = "i"} + %10:2 = hlfir.declare %9 {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) local(@_QFdo_concurrent_reduceEl_private_i32 %4#0 -> %arg1 : !fir.ref) reduce(@add_reduction_i32 #fir.reduce_attr %6#0 -> %arg2 : !fir.ref) { + %11 = fir.convert %arg0 : (index) -> i32 + fir.store %11 to %10#0 : !fir.ref + %12:2 = hlfir.declare %arg1 {uniq_name = "_QFdo_concurrent_reduceEl"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %13:2 = hlfir.declare %arg2 {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1_i32_0 = arith.constant 1 : i32 + hlfir.assign %c1_i32_0 to %12#0 : i32, !fir.ref + %14 = fir.load %13#0 : !fir.ref + %15 = fir.load %12#0 : !fir.ref + %16 = arith.addi %14, %15 : i32 + hlfir.assign %16 to %13#0 : i32, !fir.ref + } + } + return +} + +// CHECK-LABEL: omp.declare_reduction @add_reduction_i32.omp : i32 init { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32): +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 +// CHECK: omp.yield(%[[VAL_1]] : i32) + +// CHECK-LABEL: } 
combiner { +// CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32): +// CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32 +// CHECK: omp.yield(%[[VAL_2]] : i32) +// CHECK: } + +// CHECK: omp.private {type = private} @_QFdo_concurrent_reduceEl_private_i32.omp : i32 + +// CHECK-LABEL: func.func @_QPdo_concurrent_reduce() { +// CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i"} +// CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "l", uniq_name = "_QFdo_concurrent_reduceEl"} +// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFdo_concurrent_reduceEl"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK: omp.parallel { +// CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i"} +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: omp.wsloop private(@_QFdo_concurrent_reduceEl_private_i32.omp %[[VAL_3]]#0 -> %[[VAL_9:.*]] : !fir.ref) reduction(@add_reduction_i32.omp %[[VAL_5]]#0 -> %[[VAL_10:.*]] : !fir.ref) { +// CHECK: omp.loop_nest (%[[VAL_11:.*]]) : index = (%[[VAL_6]]) to (%[[VAL_6]]) inclusive step (%[[VAL_6]]) { +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (index) -> i32 +// CHECK: fir.store %[[VAL_12]] to %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_9]] {uniq_name = "_QFdo_concurrent_reduceEl"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_15:.*]] 
= arith.constant 1 : i32 +// CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_13]]#0 : i32, !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref +// CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref +// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_16]], %[[VAL_17]] : i32 +// CHECK: hlfir.assign %[[VAL_18]] to %[[VAL_14]]#0 : i32, !fir.ref +// CHECK: omp.yield +// CHECK: } +// CHECK: } +// CHECK: omp.terminator +// CHECK: } +// CHECK: return +// CHECK: } + diff --git a/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir b/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir index d9ef36b175598..c550ab8a97d4c 100644 --- a/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir +++ b/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir @@ -86,7 +86,7 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index, %j = fir.alloca i32 fir.do_concurrent.loop (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) - reduce(#fir.reduce_attr -> %sum : !fir.ref) { + reduce(@add_reduction_i32 #fir.reduce_attr %sum -> %sum_arg : !fir.ref) { %0 = fir.convert %i_iv : (index) -> i32 fir.store %0 to %i : !fir.ref diff --git a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp index e72b96fe7cd10..de6cb1d09080d 100644 --- a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp +++ b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp @@ -58,8 +58,18 @@ struct TestFIROpenACCInterfaces llvm::errs() << "Visiting: " << *op << "\n"; llvm::errs() << "\tVar: " << var << "\n"; - if (auto ptrTy = dyn_cast_if_present(typeOfVar)) { + if (mlir::isa(typeOfVar) && + mlir::isa(typeOfVar)) { + llvm::errs() << "\tPointer-like and Mappable: " << typeOfVar << "\n"; + } else if (mlir::isa(typeOfVar)) { llvm::errs() << "\tPointer-like: " << typeOfVar << "\n"; + } else { + assert( + mlir::isa(typeOfVar) && "expected mappable"); + llvm::errs() << "\tMappable: " << typeOfVar << "\n"; + } + + if (auto 
ptrTy = dyn_cast_if_present(typeOfVar)) { // If the pointee is not mappable, print details about it. Otherwise, // we defer to the mappable printing below to print those details. if (!mappableTy) { @@ -72,8 +82,6 @@ struct TestFIROpenACCInterfaces } if (mappableTy) { - llvm::errs() << "\tMappable: " << mappableTy << "\n"; - acc::VariableTypeCategory typeCategory = mappableTy.getTypeCategory(var); llvm::errs() << "\t\tType category: " << typeCategory << "\n"; diff --git a/libc/include/llvm-libc-macros/wchar-macros.h b/libc/include/llvm-libc-macros/wchar-macros.h index 5b211f5276b62..2a0cabd6133a4 100644 --- a/libc/include/llvm-libc-macros/wchar-macros.h +++ b/libc/include/llvm-libc-macros/wchar-macros.h @@ -9,8 +9,10 @@ #ifndef LLVM_LIBC_MACROS_WCHAR_MACROS_H #define LLVM_LIBC_MACROS_WCHAR_MACROS_H +#include "../llvm-libc-types/wint_t.h" + #ifndef WEOF -#define WEOF 0xffffffffu +#define WEOF ((wint_t)(0xffffffffu)) #endif #endif // LLVM_LIBC_MACROS_WCHAR_MACROS_H diff --git a/libc/shared/math.h b/libc/shared/math.h index 4ddc29c7ae834..9db53b69041d0 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -12,5 +12,6 @@ #include "libc_common.h" #include "math/expf.h" +#include "math/expf16.h" #endif // LLVM_LIBC_SHARED_MATH_H diff --git a/libc/shared/math/expf16.h b/libc/shared/math/expf16.h new file mode 100644 index 0000000000000..a6a3e89e680d4 --- /dev/null +++ b/libc/shared/math/expf16.h @@ -0,0 +1,29 @@ +//===-- Shared expf16 function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXPF16_H +#define LLVM_LIBC_SHARED_MATH_EXPF16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/expf16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::expf16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXPF16_H diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 7e85136c08851..294d68474bd53 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -180,6 +180,19 @@ add_header_library( libc.src.__support.common ) +add_header_library( + wcs_to_integer + HDRS + wcs_to_integer.h + DEPENDS + .wctype_utils + .str_to_num_result + libc.hdr.errno_macros + libc.src.__support.CPP.limits + libc.src.__support.CPP.type_traits + libc.src.__support.common +) + add_header_library( integer_to_string HDRS diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 66c1d19a1cab0..4c73fba6613fa 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -22,3 +22,36 @@ add_header_library( libc.src.__support.macros.config libc.src.__support.macros.optimization ) + +add_header_library( + expf16_utils + HDRS + expf16_utils.h + DEPENDS + libc.src.__support.CPP.array + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.attributes + libc.include.llvm-libc-macros.float16_macros +) + +add_header_library( + expf16 + HDRS + expf16.h + DEPENDS + .expf16_utils + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast + 
libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.include.llvm-libc-macros.float16_macros +) diff --git a/libc/src/__support/math/expf16.h b/libc/src/__support/math/expf16.h new file mode 100644 index 0000000000000..ded28c7dba500 --- /dev/null +++ b/libc/src/__support/math/expf16.h @@ -0,0 +1,141 @@ +//===-- Implementation header for expf16 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXPF16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXPF16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +#include "expf16_utils.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +static constexpr float16 expf16(float16 x) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr fputil::ExceptValues EXPF16_EXCEPTS_LO = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.de4p-8, expf16(x) = 0x1.01cp+0 (RZ) + {0x1f79U, 0x3c07U, 1U, 0U, 0U}, + // x = 0x1.73cp-6, 
expf16(x) = 0x1.05cp+0 (RZ) + {0x25cfU, 0x3c17U, 1U, 0U, 0U}, + }}; + + constexpr fputil::ExceptValues EXPF16_EXCEPTS_HI = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.c34p+0, expf16(x) = 0x1.74cp+2 (RZ) + {0x3f0dU, 0x45d3U, 1U, 0U, 1U}, + // x = -0x1.488p-5, expf16(x) = 0x1.ebcp-1 (RZ) + {0xa922U, 0x3bafU, 1U, 0U, 0U}, + // x = -0x1.55p-5, expf16(x) = 0x1.ebp-1 (RZ) + {0xa954U, 0x3bacU, 1U, 0U, 0U}, + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When 0 < |x| <= 2^(-5), or |x| >= 12, or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x2800U || x_abs >= 0x4a00U)) { + // exp(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 12. + if (x_bits.is_pos() && x_abs >= 0x4a00U) { + // exp(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x <= -18. + if (x_u >= 0xcc80U) { + // exp(-inf) = +0 + if (x_bits.is_inf()) + return FPBits::zero().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); + + switch (fputil::quick_get_round()) { + case FE_UPWARD: + return FPBits::min_subnormal().get_val(); + default: + return FPBits::zero().get_val(); + } + } + + // When 0 < |x| <= 2^(-5). 
+ if (x_abs <= 0x2800U && !x_bits.is_zero()) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXPF16_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + float xf = x; + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]); + // > 1 + x * P; + return fputil::cast( + fputil::polyeval(xf, 0x1p+0f, 0x1p+0f, 0x1.0004p-1f, 0x1.555778p-3f)); + } + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXPF16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // exp(x) = exp(hi + mid) * exp(lo) + auto [exp_hi_mid, exp_lo] = exp_range_reduction(x); + return fputil::cast(exp_hi_mid * exp_lo); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXPF16_H diff --git a/libc/src/__support/math/expf16_utils.h b/libc/src/__support/math/expf16_utils.h new file mode 100644 index 0000000000000..bebb72b09b886 --- /dev/null +++ b/libc/src/__support/math/expf16_utils.h @@ -0,0 +1,89 @@ +//===-- Common utils for expf16 functions -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXPF16_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXPF16_UTILS_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/CPP/array.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from -18 to 12 do print(round(exp(i), SG, RN)); +static constexpr cpp::array EXP_HI = { + 0x1.05a628p-26f, 0x1.639e32p-25f, 0x1.e355bcp-24f, 0x1.4875cap-22f, + 0x1.be6c7p-21f, 0x1.2f6054p-19f, 0x1.9c54c4p-18f, 0x1.183542p-16f, + 0x1.7cd79cp-15f, 0x1.02cf22p-13f, 0x1.5fc21p-12f, 0x1.de16bap-11f, + 0x1.44e52p-9f, 0x1.b993fep-8f, 0x1.2c155cp-6f, 0x1.97db0cp-5f, + 0x1.152aaap-3f, 0x1.78b564p-2f, 0x1p+0f, 0x1.5bf0a8p+1f, + 0x1.d8e64cp+2f, 0x1.415e5cp+4f, 0x1.b4c902p+5f, 0x1.28d38ap+7f, + 0x1.936dc6p+8f, 0x1.122886p+10f, 0x1.749ea8p+11f, 0x1.fa7158p+12f, + 0x1.5829dcp+14f, 0x1.d3c448p+15f, 0x1.3de166p+17f, +}; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 7 do print(round(exp(i * 2^-3), SG, RN)); +static constexpr cpp::array EXP_MID = { + 0x1p+0f, 0x1.221604p+0f, 0x1.48b5e4p+0f, 0x1.747a52p+0f, + 0x1.a61298p+0f, 0x1.de455ep+0f, 0x1.0ef9dcp+1f, 0x1.330e58p+1f, +}; + +struct ExpRangeReduction { + float exp_hi_mid; + float exp_lo; +}; + +static constexpr ExpRangeReduction exp_range_reduction(float16 x) { + // For -18 < x < 12, to compute exp(x), we perform the following range + // reduction: find hi, mid, lo, such that: + // x = hi + mid + lo, in which + // hi is an integer, + // mid * 2^3 is an integer, + // -2^(-4) <= lo < 2^(-4). 
+ // In particular, + // hi + mid = round(x * 2^3) * 2^(-3). + // Then, + // exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo). + // We store exp(hi) and exp(mid) in the lookup tables EXP_HI and EXP_MID + // respectively. exp(lo) is computed using a degree-3 minimax polynomial + // generated by Sollya. + + float xf = x; + float kf = fputil::nearest_integer(xf * 0x1.0p+3f); + int x_hi_mid = static_cast(kf); + int x_hi = x_hi_mid >> 3; + int x_mid = x_hi_mid & 0x7; + // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x + float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf); + + float exp_hi = EXP_HI[x_hi + 18]; + float exp_mid = EXP_MID[x_mid]; + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-4, 2^-4]); + // > 1 + x * P; + float exp_lo = + fputil::polyeval(lo, 0x1p+0f, 0x1p+0f, 0x1.001p-1f, 0x1.555ddep-3f); + return {exp_hi * exp_mid, exp_lo}; +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXPF16_UTILS_H diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 0748e1cb8a8b4..a7dd7ce0ae25a 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -1135,7 +1135,7 @@ LIBC_INLINE StrToNumResult strtofloatingpoint(const char *__restrict src) { int error = 0; - size_t index = static_cast(first_non_whitespace(src) - src); + size_t index = first_non_whitespace(src); if (src[index] == '+' || src[index] == '-') { sign = src[index]; diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index 76a99a8948941..d332c929f2c31 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -29,17 +29,16 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -// Returns a pointer to the first character in src that is not a whitespace +// Returns the idx to the 
first character in src that is not a whitespace // character (as determined by isspace()) -// TODO: Change from returning a pointer to returning a length. -LIBC_INLINE const char * +LIBC_INLINE size_t first_non_whitespace(const char *__restrict src, size_t src_len = cpp::numeric_limits::max()) { size_t src_cur = 0; while (src_cur < src_len && internal::isspace(src[src_cur])) { ++src_cur; } - return src + src_cur; + return src_cur; } // checks if the next 3 characters of the string pointer are the start of a @@ -96,7 +95,7 @@ strtointeger(const char *__restrict src, int base, if (base < 0 || base == 1 || base > 36) return {0, 0, EINVAL}; - src_cur = static_cast(first_non_whitespace(src, src_len) - src); + src_cur = first_non_whitespace(src, src_len); char result_sign = '+'; if (src[src_cur] == '+' || src[src_cur] == '-') { diff --git a/libc/src/__support/wcs_to_integer.h b/libc/src/__support/wcs_to_integer.h new file mode 100644 index 0000000000000..4254bd860f77a --- /dev/null +++ b/libc/src/__support/wcs_to_integer.h @@ -0,0 +1,155 @@ +//===-- Widechar string to integer conversion utils -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H +#define LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H + +#include "hdr/errno_macros.h" // For ERANGE +#include "src/__support/CPP/limits.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/CPP/type_traits/make_unsigned.h" +#include "src/__support/big_int.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_num_result.h" +#include "src/__support/uint128.h" +#include "src/__support/wctype_utils.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// Returns the idx of the first character in src that is not a whitespace +// character (as determined by iswspace()) +LIBC_INLINE size_t +first_non_whitespace(const wchar_t *__restrict src, + size_t src_len = cpp::numeric_limits::max()) { + size_t src_cur = 0; + while (src_cur < src_len && internal::iswspace(src[src_cur])) { + ++src_cur; + } + return src_cur; +} + +// checks if the next 3 characters of the string pointer are the start of a +// hexadecimal number. Does not advance the string pointer. +LIBC_INLINE bool +is_hex_start(const wchar_t *__restrict src, + size_t src_len = cpp::numeric_limits::max()) { + if (src_len < 3) + return false; + return *src == L'0' && towlower(*(src + 1)) == L'x' && iswalnum(*(src + 2)) && + b36_wchar_to_int(*(src + 2)) < 16; +} + +// Takes the address of the string pointer and parses the base from the start of +// it. +LIBC_INLINE int infer_base(const wchar_t *__restrict src, size_t src_len) { + // A hexadecimal number is defined as "the prefix 0x or 0X followed by a + // sequence of the decimal digits and the letters a (or A) through f (or F) + // with values 10 through 15 respectively." 
(C standard 6.4.4.1) + if (is_hex_start(src, src_len)) + return 16; + // An octal number is defined as "the prefix 0 optionally followed by a + // sequence of the digits 0 through 7 only" (C standard 6.4.4.1) and so any + // number that starts with 0, including just 0, is an octal number. + if (src_len > 0 && src[0] == L'0') + return 8; + // A decimal number is defined as beginning "with a nonzero digit and + // consist[ing] of a sequence of decimal digits." (C standard 6.4.4.1) + return 10; +} + +template +LIBC_INLINE StrToNumResult +wcstointeger(const wchar_t *__restrict src, int base, + const size_t src_len = cpp::numeric_limits::max()) { + using ResultType = make_integral_or_big_int_unsigned_t; + + ResultType result = 0; + + bool is_number = false; + size_t src_cur = 0; + int error_val = 0; + + if (src_len == 0) + return {0, 0, 0}; + + if (base < 0 || base == 1 || base > 36) + return {0, 0, EINVAL}; + + src_cur = first_non_whitespace(src, src_len); + + wchar_t result_sign = L'+'; + if (src[src_cur] == L'+' || src[src_cur] == L'-') { + result_sign = src[src_cur]; + ++src_cur; + } + + if (base == 0) + base = infer_base(src + src_cur, src_len - src_cur); + + if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur)) + src_cur = src_cur + 2; + + constexpr bool IS_UNSIGNED = cpp::is_unsigned_v; + const bool is_positive = (result_sign == L'+'); + + ResultType constexpr NEGATIVE_MAX = + !IS_UNSIGNED ? static_cast(cpp::numeric_limits::max()) + 1 + : cpp::numeric_limits::max(); + ResultType const abs_max = + (is_positive ? 
cpp::numeric_limits::max() : NEGATIVE_MAX); + ResultType const abs_max_div_by_base = + abs_max / static_cast(base); + + while (src_cur < src_len && iswalnum(src[src_cur])) { + int cur_digit = b36_wchar_to_int(src[src_cur]); + if (cur_digit >= base) + break; + + is_number = true; + ++src_cur; + + // If the number has already hit the maximum value for the current type then + // the result cannot change, but we still need to advance src to the end of + // the number. + if (result == abs_max) { + error_val = ERANGE; + continue; + } + + if (result > abs_max_div_by_base) { + result = abs_max; + error_val = ERANGE; + } else { + result = result * static_cast(base); + } + if (result > abs_max - static_cast(cur_digit)) { + result = abs_max; + error_val = ERANGE; + } else { + result = result + static_cast(cur_digit); + } + } + + ptrdiff_t str_len = is_number ? static_cast(src_cur) : 0; + + if (error_val == ERANGE) { + if (is_positive || IS_UNSIGNED) + return {cpp::numeric_limits::max(), str_len, error_val}; + else // T is signed and there is a negative overflow + return {cpp::numeric_limits::min(), str_len, error_val}; + } + + return {static_cast(is_positive ? 
result : -result), str_len, error_val}; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCS_TO_INTEGER_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index fd1e6c0d648aa..6c3f28f423c7b 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1332,19 +1332,8 @@ add_entrypoint_object( HDRS ../expf16.h DEPENDS - .expxf16 - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.CPP.array - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization + libc.src.__support.math.expf16 + libc.src.errno.errno ) add_entrypoint_object( @@ -5075,11 +5064,10 @@ add_header_library( HDRS expxf16.h DEPENDS - libc.src.__support.CPP.array libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval libc.src.__support.macros.attributes + libc.src.__support.math.expf16_utils ) diff --git a/libc/src/math/generic/expf16.cpp b/libc/src/math/generic/expf16.cpp index 1af9b3ec9ad6e..ad213e237f021 100644 --- a/libc/src/math/generic/expf16.cpp +++ b/libc/src/math/generic/expf16.cpp @@ -7,120 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/math/expf16.h" -#include "expxf16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include 
"src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -namespace LIBC_NAMESPACE_DECL { - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr fputil::ExceptValues EXPF16_EXCEPTS_LO = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.de4p-8, expf16(x) = 0x1.01cp+0 (RZ) - {0x1f79U, 0x3c07U, 1U, 0U, 0U}, - // x = 0x1.73cp-6, expf16(x) = 0x1.05cp+0 (RZ) - {0x25cfU, 0x3c17U, 1U, 0U, 0U}, -}}; - -static constexpr fputil::ExceptValues EXPF16_EXCEPTS_HI = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.c34p+0, expf16(x) = 0x1.74cp+2 (RZ) - {0x3f0dU, 0x45d3U, 1U, 0U, 1U}, - // x = -0x1.488p-5, expf16(x) = 0x1.ebcp-1 (RZ) - {0xa922U, 0x3bafU, 1U, 0U, 0U}, - // x = -0x1.55p-5, expf16(x) = 0x1.ebp-1 (RZ) - {0xa954U, 0x3bacU, 1U, 0U, 0U}, -}}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) { - using FPBits = fputil::FPBits; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When 0 < |x| <= 2^(-5), or |x| >= 12, or x is NaN. - if (LIBC_UNLIKELY(x_abs <= 0x2800U || x_abs >= 0x4a00U)) { - // exp(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 12. - if (x_bits.is_pos() && x_abs >= 0x4a00U) { - // exp(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); +#include "src/__support/math/expf16.h" - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x <= -18. 
- if (x_u >= 0xcc80U) { - // exp(-inf) = +0 - if (x_bits.is_inf()) - return FPBits::zero().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - - switch (fputil::quick_get_round()) { - case FE_UPWARD: - return FPBits::min_subnormal().get_val(); - default: - return FPBits::zero().get_val(); - } - } - - // When 0 < |x| <= 2^(-5). - if (x_abs <= 0x2800U && !x_bits.is_zero()) { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXPF16_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - float xf = x; - // Degree-3 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]); - // > 1 + x * P; - return fputil::cast( - fputil::polyeval(xf, 0x1p+0f, 0x1p+0f, 0x1.0004p-1f, 0x1.555778p-3f)); - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXPF16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS +namespace LIBC_NAMESPACE_DECL { - // exp(x) = exp(hi + mid) * exp(lo) - auto [exp_hi_mid, exp_lo] = exp_range_reduction(x); - return fputil::cast(exp_hi_mid * exp_lo); -} +LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) { return math::expf16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index 67bb248307519..05ac95d586823 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -9,9 +9,7 @@ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H #define LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H -#include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" @@ -19,69 +17,9 @@ 
#include "src/__support/macros/config.h" #include -namespace LIBC_NAMESPACE_DECL { - -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > for i from -18 to 12 do print(round(exp(i), SG, RN)); -static constexpr cpp::array EXP_HI = { - 0x1.05a628p-26f, 0x1.639e32p-25f, 0x1.e355bcp-24f, 0x1.4875cap-22f, - 0x1.be6c7p-21f, 0x1.2f6054p-19f, 0x1.9c54c4p-18f, 0x1.183542p-16f, - 0x1.7cd79cp-15f, 0x1.02cf22p-13f, 0x1.5fc21p-12f, 0x1.de16bap-11f, - 0x1.44e52p-9f, 0x1.b993fep-8f, 0x1.2c155cp-6f, 0x1.97db0cp-5f, - 0x1.152aaap-3f, 0x1.78b564p-2f, 0x1p+0f, 0x1.5bf0a8p+1f, - 0x1.d8e64cp+2f, 0x1.415e5cp+4f, 0x1.b4c902p+5f, 0x1.28d38ap+7f, - 0x1.936dc6p+8f, 0x1.122886p+10f, 0x1.749ea8p+11f, 0x1.fa7158p+12f, - 0x1.5829dcp+14f, 0x1.d3c448p+15f, 0x1.3de166p+17f, -}; - -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > for i from 0 to 7 do print(round(exp(i * 2^-3), SG, RN)); -static constexpr cpp::array EXP_MID = { - 0x1p+0f, 0x1.221604p+0f, 0x1.48b5e4p+0f, 0x1.747a52p+0f, - 0x1.a61298p+0f, 0x1.de455ep+0f, 0x1.0ef9dcp+1f, 0x1.330e58p+1f, -}; - -struct ExpRangeReduction { - float exp_hi_mid; - float exp_lo; -}; +#include "src/__support/math/expf16_utils.h" -LIBC_INLINE ExpRangeReduction exp_range_reduction(float16 x) { - // For -18 < x < 12, to compute exp(x), we perform the following range - // reduction: find hi, mid, lo, such that: - // x = hi + mid + lo, in which - // hi is an integer, - // mid * 2^3 is an integer, - // -2^(-4) <= lo < 2^(-4). - // In particular, - // hi + mid = round(x * 2^3) * 2^(-3). - // Then, - // exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo). - // We store exp(hi) and exp(mid) in the lookup tables EXP_HI and EXP_MID - // respectively. exp(lo) is computed using a degree-3 minimax polynomial - // generated by Sollya. 
- - float xf = x; - float kf = fputil::nearest_integer(xf * 0x1.0p+3f); - int x_hi_mid = static_cast(kf); - int x_hi = x_hi_mid >> 3; - int x_mid = x_hi_mid & 0x7; - // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x - float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf); - - float exp_hi = EXP_HI[x_hi + 18]; - float exp_mid = EXP_MID[x_mid]; - // Degree-3 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-4, 2^-4]); - // > 1 + x * P; - float exp_lo = - fputil::polyeval(lo, 0x1p+0f, 0x1p+0f, 0x1.001p-1f, 0x1.555ddep-3f); - return {exp_hi * exp_mid, exp_lo}; -} +namespace LIBC_NAMESPACE_DECL { // Generated by Sollya with the following commands: // > display = hexadecimal; diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 9f626ed31cc07..e54d7a5c9638b 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -141,6 +141,17 @@ add_libc_test( libc.src.__support.str_to_integer ) +add_libc_test( + wcs_to_integer_test + SUITE + libc-support-tests + SRCS + wcs_to_integer_test.cpp + DEPENDS + libc.src.__support.integer_literals + libc.src.__support.wcs_to_integer +) + add_libc_test( integer_to_string_test SUITE diff --git a/libc/test/src/__support/wcs_to_integer_test.cpp b/libc/test/src/__support/wcs_to_integer_test.cpp new file mode 100644 index 0000000000000..e4107929c15fc --- /dev/null +++ b/libc/test/src/__support/wcs_to_integer_test.cpp @@ -0,0 +1,239 @@ +//===-- Unittests for wcs_to_integer --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/libc_errno.h" +#include "src/__support/wcs_to_integer.h" +#include + +#include "test/UnitTest/Test.h" + +// This file is for testing the src_len argument and other internal interface +// features. Primary testing is done through the public interface. + +TEST(LlvmLibcStrToIntegerTest, SimpleLength) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 2); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(2)); + ASSERT_EQ(result.value, 12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, LeadingSpaces) { + auto result = + LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 15); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 5); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" 12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + 
ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, LeadingSign) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, -12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, -12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 3); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); + ASSERT_EQ(result.value, 12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 3); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(3)); + ASSERT_EQ(result.value, -12); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"+12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"-12345", 10, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base16PrefixAutoSelect) { 
+ auto result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 5); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); + ASSERT_EQ(result.value, 0x123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 2); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 0, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base16PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(7)); + ASSERT_EQ(result.value, 0x12345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 5); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(5)); + ASSERT_EQ(result.value, 0x123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 2); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"0x12345", 16, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base8PrefixAutoSelect) 
{ + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 4); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); + ASSERT_EQ(result.value, 0123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 0, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, Base8PrefixManualSelect) { + auto result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 6); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 012345); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 4); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(4)); + ASSERT_EQ(result.value, 0123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 1); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(1)); + ASSERT_EQ(result.value, 0); + + result = LIBC_NAMESPACE::internal::wcstointeger(L"012345", 8, 0); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(0)); + ASSERT_EQ(result.value, 0); +} + +TEST(LlvmLibcStrToIntegerTest, CombinedTests) { + auto result = + 
LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 10); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(10)); + ASSERT_EQ(result.value, -0x123); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 8); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(8)); + ASSERT_EQ(result.value, -0x1); + + result = LIBC_NAMESPACE::internal::wcstointeger(L" -0x123", 0, 7); + EXPECT_FALSE(result.has_error()); + EXPECT_EQ(result.parsed_len, ptrdiff_t(6)); + ASSERT_EQ(result.value, 0); +} diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 745ccbc748ecd..2b460aef6ef32 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -113,7 +113,8 @@ class RoundToIntegerTestTemplate } void testSubnormalRange(RoundToIntegerFunc func) { - constexpr int COUNT = 1'000'001; + // Arbitrary, trades off completeness with testing time (esp. on failure) + constexpr int COUNT = 1'000; constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( static_cast((MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT), StorageType(1)); diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp index 903084da053a1..ed825aa0a2adf 100644 --- a/libcxx/src/atomic.cpp +++ b/libcxx/src/atomic.cpp @@ -41,6 +41,10 @@ // OpenBSD has no indirect syscalls # define _LIBCPP_FUTEX(...) 
+# include <os/os_sync_wait_on_address.h>
0 : ULF_WAKE_ALL), const_cast<__cxx_atomic_contention_t*>(__ptr), 0); + os_sync_wake_by_address_all(const_cast<__cxx_atomic_contention_t*>(__ptr), 8, OS_SYNC_WAKE_BY_ADDRESS_NONE); } #elif defined(__FreeBSD__) && __SIZEOF_LONG__ == 8 diff --git a/libcxx/test/configs/stdlib-libstdc++.cfg.in b/libcxx/test/configs/stdlib-libstdc++.cfg.in index b9672f038a763..3ff0c542f0630 100644 --- a/libcxx/test/configs/stdlib-libstdc++.cfg.in +++ b/libcxx/test/configs/stdlib-libstdc++.cfg.in @@ -9,7 +9,8 @@ # # $ ./libcxx/utils/libcxx-lit -sv libcxx/test/std --param libstdcxx_install_prefix=/opt/homebrew/Cellar/gcc/14.1.0_1 \ # --param libstdcxx_version=14 \ -# --param libstdcxx_triple=aarch64-apple-darwin22 +# --param libstdcxx_triple=aarch64-apple-darwin22 \ +# --param stdlib=libstdc++ # lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') diff --git a/libcxx/test/libcxx/language.support/support.types/cstddef.compile.pass.cpp b/libcxx/test/extensions/all/cstddef.compile.pass.cpp similarity index 99% rename from libcxx/test/libcxx/language.support/support.types/cstddef.compile.pass.cpp rename to libcxx/test/extensions/all/cstddef.compile.pass.cpp index 514353a103029..c6f670d7bfc06 100644 --- a/libcxx/test/libcxx/language.support/support.types/cstddef.compile.pass.cpp +++ b/libcxx/test/extensions/all/cstddef.compile.pass.cpp @@ -11,6 +11,7 @@ // appear to provide that behavior too. 
#include + #include "test_macros.h" using PtrdiffT = ::ptrdiff_t; diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/extensions/clang/clang_modules_include.gen.py similarity index 100% rename from libcxx/test/libcxx/clang_modules_include.gen.py rename to libcxx/test/extensions/clang/clang_modules_include.gen.py diff --git a/libcxx/test/extensions/clang/lit.local.cfg b/libcxx/test/extensions/clang/lit.local.cfg new file mode 100644 index 0000000000000..b0a1c7d9b17e4 --- /dev/null +++ b/libcxx/test/extensions/clang/lit.local.cfg @@ -0,0 +1,4 @@ + +# Only libc++ supports clang-specific extensions +if "stdlib=libc++" not in config.available_features: + config.unsupported = True diff --git a/libcxx/test/libcxx/include_as_c.sh.cpp b/libcxx/test/extensions/libcxx/include_as_c.sh.cpp similarity index 100% rename from libcxx/test/libcxx/include_as_c.sh.cpp rename to libcxx/test/extensions/libcxx/include_as_c.sh.cpp diff --git a/libcxx/test/libcxx/libcpp_version.gen.py b/libcxx/test/extensions/libcxx/libcpp_version.gen.py similarity index 93% rename from libcxx/test/libcxx/libcpp_version.gen.py rename to libcxx/test/extensions/libcxx/libcpp_version.gen.py index b30623fe2c388..ebeab777c934f 100644 --- a/libcxx/test/libcxx/libcpp_version.gen.py +++ b/libcxx/test/extensions/libcxx/libcpp_version.gen.py @@ -6,7 +6,7 @@ # # ===----------------------------------------------------------------------===## -# Test that all headers define the _LIBCPP_VERSION macro. +# Test that all public headers define the _LIBCPP_VERSION macro. 
# RUN: %{python} %s %{libcxx-dir}/utils diff --git a/libcxx/test/extensions/libcxx/lit.local.cfg b/libcxx/test/extensions/libcxx/lit.local.cfg new file mode 100644 index 0000000000000..2b128105558b2 --- /dev/null +++ b/libcxx/test/extensions/libcxx/lit.local.cfg @@ -0,0 +1,3 @@ + +if "stdlib=libc++" not in config.available_features: + config.unsupported = True diff --git a/libcxx/test/libcxx/no_assert_include.gen.py b/libcxx/test/extensions/libcxx/no_assert_include.gen.py similarity index 100% rename from libcxx/test/libcxx/no_assert_include.gen.py rename to libcxx/test/extensions/libcxx/no_assert_include.gen.py diff --git a/libcxx/test/selftest/lit.local.cfg b/libcxx/test/selftest/lit.local.cfg deleted file mode 100644 index 4467d8070cc70..0000000000000 --- a/libcxx/test/selftest/lit.local.cfg +++ /dev/null @@ -1,5 +0,0 @@ -# The tests in this directory need to run Python -import shlex -import sys - -config.substitutions.append(("%{python}", shlex.quote(sys.executable))) diff --git a/libcxx/test/libcxx/double_include.gen.py b/libcxx/test/std/double_include.gen.py similarity index 93% rename from libcxx/test/libcxx/double_include.gen.py rename to libcxx/test/std/double_include.gen.py index f58e72f94a353..fcf3b9a8fa2e0 100644 --- a/libcxx/test/libcxx/double_include.gen.py +++ b/libcxx/test/std/double_include.gen.py @@ -28,6 +28,9 @@ {lit_header_restrictions.get(header, '')} {lit_header_undeprecations.get(header, '')} +// We're using compiler-specific flags in this test +// REQUIRES: (gcc || clang) + // RUN: %{{cxx}} -c %s -o %t.first.o %{{flags}} %{{compile_flags}} // RUN: %{{cxx}} -c %s -o %t.second.o -DWITH_MAIN %{{flags}} %{{compile_flags}} // RUN: %{{cxx}} -o %t.exe %t.first.o %t.second.o %{{flags}} %{{link_flags}} diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/std/header_inclusions.gen.py similarity index 91% rename from libcxx/test/libcxx/header_inclusions.gen.py rename to libcxx/test/std/header_inclusions.gen.py index 
+// TODO: This is currently a libc++-specific way of testing the includes, but is a requirement for all implementations
+// XFAIL: LIBCXX-ANDROID-FIXME + // type_traits // is_bounded_array diff --git a/lld/Common/DriverDispatcher.cpp b/lld/Common/DriverDispatcher.cpp index fe18c320983fa..34f0ed24b3df0 100644 --- a/lld/Common/DriverDispatcher.cpp +++ b/lld/Common/DriverDispatcher.cpp @@ -45,7 +45,7 @@ static cl::TokenizerCallback getDefaultQuotingStyle() { static bool isPETargetName(StringRef s) { return s == "i386pe" || s == "i386pep" || s == "thumb2pe" || s == "arm64pe" || - s == "arm64ecpe"; + s == "arm64ecpe" || s == "arm64xpe"; } static std::optional isPETarget(llvm::ArrayRef args) { diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index 98d48bdfcf311..5098dbd77b4fd 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -448,6 +448,8 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, add("-machine:arm64"); else if (s == "arm64ecpe") add("-machine:arm64ec"); + else if (s == "arm64xpe") + add("-machine:arm64x"); else error("unknown parameter: -m" + s); } diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index 907d2d87dda5c..618b888504320 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -31,6 +31,12 @@ ARM64EC-SAME: -machine:arm64ec ARM64EC-SAME: -alternatename:__image_base__=__ImageBase ARM64EC-SAME: foo.o +RUN: ld.lld -### foo.o -m arm64xpe 2>&1 | FileCheck -check-prefix=ARM64X %s +ARM64X: -out:a.exe +ARM64X-SAME: -machine:arm64x +ARM64X-SAME: -alternatename:__image_base__=__ImageBase +ARM64X-SAME: foo.o + RUN: ld.lld -### foo.o -m i386pep -shared 2>&1 | FileCheck -check-prefix=SHARED %s RUN: ld.lld -### foo.o -m i386pep --shared 2>&1 | FileCheck -check-prefix=SHARED %s RUN: ld.lld -### foo.o -m i386pep --dll 2>&1 | FileCheck -check-prefix=SHARED %s diff --git a/lldb/include/lldb/Breakpoint/Breakpoint.h b/lldb/include/lldb/Breakpoint/Breakpoint.h index b200a1e4893df..26a5e901a0d7e 100644 --- a/lldb/include/lldb/Breakpoint/Breakpoint.h +++ b/lldb/include/lldb/Breakpoint/Breakpoint.h @@ -397,16 +397,12 @@ class 
Breakpoint : public std::enable_shared_from_this, /// Set the breakpoint's condition. /// /// \param[in] condition - /// The condition expression to evaluate when the breakpoint is hit. - /// Pass in nullptr to clear the condition. - void SetCondition(const char *condition); + /// The condition to evaluate when the breakpoint is hit. + /// Pass in an empty condition to clear the condition. + void SetCondition(StopCondition condition); - /// Return a pointer to the text of the condition expression. - /// - /// \return - /// A pointer to the condition expression text, or nullptr if no - // condition has been set. - const char *GetConditionText() const; + /// Return the breakpoint condition. + const StopCondition &GetCondition() const; // The next section are various utility functions. diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocation.h b/lldb/include/lldb/Breakpoint/BreakpointLocation.h index ce3a21f92bd46..ab2e5e170559d 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocation.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocation.h @@ -128,15 +128,11 @@ class BreakpointLocation /// Set the breakpoint location's condition. /// /// \param[in] condition - /// The condition expression to evaluate when the breakpoint is hit. - void SetCondition(const char *condition); + /// The condition to evaluate when the breakpoint is hit. + void SetCondition(StopCondition condition); - /// Return a pointer to the text of the condition expression. - /// - /// \return - /// A pointer to the condition expression text, or nullptr if no - // condition has been set. - const char *GetConditionText(size_t *hash = nullptr) const; + /// Return the breakpoint condition. 
+ const StopCondition &GetCondition() const; bool ConditionSaysStop(ExecutionContext &exe_ctx, Status &error); diff --git a/lldb/include/lldb/Breakpoint/BreakpointOptions.h b/lldb/include/lldb/Breakpoint/BreakpointOptions.h index 7bf545717422f..2f73473c07e62 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointOptions.h +++ b/lldb/include/lldb/Breakpoint/BreakpointOptions.h @@ -12,6 +12,7 @@ #include #include +#include "lldb/Breakpoint/StopCondition.h" #include "lldb/Utility/Baton.h" #include "lldb/Utility/Flags.h" #include "lldb/Utility/StringList.h" @@ -245,18 +246,15 @@ friend class Breakpoint; const Baton *GetBaton() const; // Condition - /// Set the breakpoint option's condition. + /// Set the breakpoint stop condition. /// /// \param[in] condition - /// The condition expression to evaluate when the breakpoint is hit. - void SetCondition(const char *condition); + /// The condition to evaluate when the breakpoint is hit. + void SetCondition(StopCondition condition); - /// Return a pointer to the text of the condition expression. - /// - /// \return - /// A pointer to the condition expression text, or nullptr if no - // condition has been set. - const char *GetConditionText(size_t *hash = nullptr) const; + /// Return the breakpoint condition. + const StopCondition &GetCondition() const; + StopCondition &GetCondition(); // Enabled/Ignore Count @@ -390,9 +388,7 @@ friend class Breakpoint; /// Thread for which this breakpoint will stop. std::unique_ptr m_thread_spec_up; /// The condition to test. - std::string m_condition_text; - /// Its hash, so that locations know when the condition is updated. - size_t m_condition_text_hash; + StopCondition m_condition; /// If set, inject breakpoint condition into process. bool m_inject_condition; /// If set, auto-continue from breakpoint. 
+  void SetText(std::string text) {
+    static std::hash<std::string> hasher;
+    m_text = std::move(text);
+    // Hash the stored member: `text` has just been moved-from here, so
+    // hashing it would record the hash of an empty string for every
+    // condition and defeat the update detection that relies on m_hash.
+    m_hash = hasher(m_text);
+  }
+ lldb::LanguageType m_language = lldb::eLanguageTypeUnknown; +}; + +} // namespace lldb_private + +#endif // LLDB_BREAKPOINT_STOPCONDITION_H diff --git a/lldb/include/lldb/Host/HostThread.h b/lldb/include/lldb/Host/HostThread.h index d3477e115e2d8..c969492f5b20a 100644 --- a/lldb/include/lldb/Host/HostThread.h +++ b/lldb/include/lldb/Host/HostThread.h @@ -43,6 +43,8 @@ class HostThread { bool EqualsThread(lldb::thread_t thread) const; + bool HasThread() const; + private: std::shared_ptr m_native_thread; }; diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index a8892e9c43225..637b0774ec7db 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -2547,6 +2547,8 @@ void PruneThreadPlans(); bool CurrentThreadIsPrivateStateThread(); + bool CurrentThreadPosesAsPrivateStateThread(); + virtual Status SendEventData(const char *data) { return Status::FromErrorString( "Sending an event is not supported for this process."); diff --git a/lldb/include/lldb/Utility/LLDBLog.h b/lldb/include/lldb/Utility/LLDBLog.h index c7de41e74e85b..18e4a3ca73507 100644 --- a/lldb/include/lldb/Utility/LLDBLog.h +++ b/lldb/include/lldb/Utility/LLDBLog.h @@ -49,7 +49,8 @@ enum class LLDBLog : Log::MaskType { Watchpoints = Log::ChannelFlag<30>, OnDemand = Log::ChannelFlag<31>, Source = Log::ChannelFlag<32>, - LLVM_MARK_AS_BITMASK_ENUM(OnDemand), + Disassembler = Log::ChannelFlag<33>, + LLVM_MARK_AS_BITMASK_ENUM(Disassembler), }; LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 68f58bf1349a7..d9516670e3a89 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -908,7 +908,7 @@ def request_launch( disableASLR=False, disableSTDIO=False, shellExpandArguments=False, - runInTerminal=False, + 
console: Optional[str] = None, enableAutoVariableSummaries=False, displayExtendedBacktrace=False, enableSyntheticChildDebugging=False, @@ -958,8 +958,8 @@ def request_launch( args_dict["launchCommands"] = launchCommands if sourceMap: args_dict["sourceMap"] = sourceMap - if runInTerminal: - args_dict["runInTerminal"] = runInTerminal + if console: + args_dict["console"] = console if postRunCommands: args_dict["postRunCommands"] = postRunCommands if customFrameFormat: diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp index 397afc1f10f94..07c0a2ea907ba 100644 --- a/lldb/source/API/SBBreakpoint.cpp +++ b/lldb/source/API/SBBreakpoint.cpp @@ -275,7 +275,7 @@ void SBBreakpoint::SetCondition(const char *condition) { if (bkpt_sp) { std::lock_guard guard( bkpt_sp->GetTarget().GetAPIMutex()); - bkpt_sp->SetCondition(condition); + bkpt_sp->SetCondition(StopCondition(condition)); } } @@ -288,7 +288,7 @@ const char *SBBreakpoint::GetCondition() { std::lock_guard guard( bkpt_sp->GetTarget().GetAPIMutex()); - return ConstString(bkpt_sp->GetConditionText()).GetCString(); + return ConstString(bkpt_sp->GetCondition().GetText()).GetCString(); } void SBBreakpoint::SetAutoContinue(bool auto_continue) { diff --git a/lldb/source/API/SBBreakpointLocation.cpp b/lldb/source/API/SBBreakpointLocation.cpp index 479354a62627d..e786435c4f8af 100644 --- a/lldb/source/API/SBBreakpointLocation.cpp +++ b/lldb/source/API/SBBreakpointLocation.cpp @@ -160,7 +160,7 @@ void SBBreakpointLocation::SetCondition(const char *condition) { if (loc_sp) { std::lock_guard guard( loc_sp->GetTarget().GetAPIMutex()); - loc_sp->SetCondition(condition); + loc_sp->SetCondition(StopCondition(condition)); } } @@ -173,7 +173,7 @@ const char *SBBreakpointLocation::GetCondition() { std::lock_guard guard( loc_sp->GetTarget().GetAPIMutex()); - return ConstString(loc_sp->GetConditionText()).GetCString(); + return ConstString(loc_sp->GetCondition().GetText()).GetCString(); } void 
SBBreakpointLocation::SetAutoContinue(bool auto_continue) { diff --git a/lldb/source/API/SBBreakpointName.cpp b/lldb/source/API/SBBreakpointName.cpp index 831260d44e8e7..0b588c38d5114 100644 --- a/lldb/source/API/SBBreakpointName.cpp +++ b/lldb/source/API/SBBreakpointName.cpp @@ -303,7 +303,7 @@ void SBBreakpointName::SetCondition(const char *condition) { std::lock_guard guard( m_impl_up->GetTarget()->GetAPIMutex()); - bp_name->GetOptions().SetCondition(condition); + bp_name->GetOptions().SetCondition(StopCondition(condition)); UpdateName(*bp_name); } @@ -317,7 +317,8 @@ const char *SBBreakpointName::GetCondition() { std::lock_guard guard( m_impl_up->GetTarget()->GetAPIMutex()); - return ConstString(bp_name->GetOptions().GetConditionText()).GetCString(); + return ConstString(bp_name->GetOptions().GetCondition().GetText()) + .GetCString(); } void SBBreakpointName::SetAutoContinue(bool auto_continue) { diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp index ec27a7dc7b41f..d757bc41cdc32 100644 --- a/lldb/source/Breakpoint/Breakpoint.cpp +++ b/lldb/source/Breakpoint/Breakpoint.cpp @@ -440,13 +440,13 @@ const char *Breakpoint::GetQueueName() const { return m_options.GetThreadSpecNoCreate()->GetQueueName(); } -void Breakpoint::SetCondition(const char *condition) { - m_options.SetCondition(condition); +void Breakpoint::SetCondition(StopCondition condition) { + m_options.SetCondition(std::move(condition)); SendBreakpointChangedEvent(eBreakpointEventTypeConditionChanged); } -const char *Breakpoint::GetConditionText() const { - return m_options.GetConditionText(); +const StopCondition &Breakpoint::GetCondition() const { + return m_options.GetCondition(); } // This function is used when "baton" doesn't need to be freed diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 7ac9c8f5ddc4d..443d4f50833d3 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ 
b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -203,14 +203,13 @@ void BreakpointLocation::ClearCallback() { GetLocationOptions().ClearCallback(); } -void BreakpointLocation::SetCondition(const char *condition) { - GetLocationOptions().SetCondition(condition); +void BreakpointLocation::SetCondition(StopCondition condition) { + GetLocationOptions().SetCondition(std::move(condition)); SendBreakpointLocationChangedEvent(eBreakpointEventTypeConditionChanged); } -const char *BreakpointLocation::GetConditionText(size_t *hash) const { - return GetOptionsSpecifyingKind(BreakpointOptions::eCondition) - .GetConditionText(hash); +const StopCondition &BreakpointLocation::GetCondition() const { + return GetOptionsSpecifyingKind(BreakpointOptions::eCondition).GetCondition(); } bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, @@ -219,10 +218,9 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, std::lock_guard guard(m_condition_mutex); - size_t condition_hash; - const char *condition_text = GetConditionText(&condition_hash); + StopCondition condition = GetCondition(); - if (!condition_text) { + if (!condition) { m_user_expression_sp.reset(); return false; } @@ -231,19 +229,22 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, DiagnosticManager diagnostics; - if (condition_hash != m_condition_hash || !m_user_expression_sp || + if (condition.GetHash() != m_condition_hash || !m_user_expression_sp || !m_user_expression_sp->IsParseCacheable() || !m_user_expression_sp->MatchesContext(exe_ctx)) { - LanguageType language = eLanguageTypeUnknown; - // See if we can figure out the language from the frame, otherwise use the - // default language: - CompileUnit *comp_unit = m_address.CalculateSymbolContextCompileUnit(); - if (comp_unit) - language = comp_unit->GetLanguage(); + LanguageType language = condition.GetLanguage(); + if (language == lldb::eLanguageTypeUnknown) { + // See if we can figure out the language from the 
frame, otherwise use the + // default language: + if (CompileUnit *comp_unit = + m_address.CalculateSymbolContextCompileUnit()) + language = comp_unit->GetLanguage(); + } m_user_expression_sp.reset(GetTarget().GetUserExpressionForLanguage( - condition_text, llvm::StringRef(), language, Expression::eResultTypeAny, - EvaluateExpressionOptions(), nullptr, error)); + condition.GetText(), llvm::StringRef(), language, + Expression::eResultTypeAny, EvaluateExpressionOptions(), nullptr, + error)); if (error.Fail()) { LLDB_LOGF(log, "Error getting condition expression: %s.", error.AsCString()); @@ -262,7 +263,7 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, return true; } - m_condition_hash = condition_hash; + m_condition_hash = condition.GetHash(); } // We need to make sure the user sees any parse errors in their condition, so diff --git a/lldb/source/Breakpoint/BreakpointOptions.cpp b/lldb/source/Breakpoint/BreakpointOptions.cpp index 08e48c4921078..b0b794f0f93bf 100644 --- a/lldb/source/Breakpoint/BreakpointOptions.cpp +++ b/lldb/source/Breakpoint/BreakpointOptions.cpp @@ -106,8 +106,8 @@ const char *BreakpointOptions::g_option_names[( BreakpointOptions::BreakpointOptions(bool all_flags_set) : m_callback(nullptr), m_baton_is_command_baton(false), m_callback_is_synchronous(false), m_enabled(true), m_one_shot(false), - m_ignore_count(0), m_condition_text_hash(0), m_inject_condition(false), - m_auto_continue(false), m_set_flags(0) { + m_ignore_count(0), m_inject_condition(false), m_auto_continue(false), + m_set_flags(0) { if (all_flags_set) m_set_flags.Set(~((Flags::ValueType)0)); } @@ -117,11 +117,11 @@ BreakpointOptions::BreakpointOptions(const char *condition, bool enabled, bool auto_continue) : m_callback(nullptr), m_baton_is_command_baton(false), m_callback_is_synchronous(false), m_enabled(enabled), - m_one_shot(one_shot), m_ignore_count(ignore), m_condition_text_hash(0), + m_one_shot(one_shot), m_ignore_count(ignore), m_condition(condition), 
m_inject_condition(false), m_auto_continue(auto_continue) { m_set_flags.Set(eEnabled | eIgnoreCount | eOneShot | eAutoContinue); if (condition && *condition != '\0') { - SetCondition(condition); + SetCondition(StopCondition(condition)); } } @@ -135,8 +135,7 @@ BreakpointOptions::BreakpointOptions(const BreakpointOptions &rhs) m_auto_continue(rhs.m_auto_continue), m_set_flags(rhs.m_set_flags) { if (rhs.m_thread_spec_up != nullptr) m_thread_spec_up = std::make_unique(*rhs.m_thread_spec_up); - m_condition_text = rhs.m_condition_text; - m_condition_text_hash = rhs.m_condition_text_hash; + m_condition = rhs.m_condition; } // BreakpointOptions assignment operator @@ -151,8 +150,7 @@ operator=(const BreakpointOptions &rhs) { m_ignore_count = rhs.m_ignore_count; if (rhs.m_thread_spec_up != nullptr) m_thread_spec_up = std::make_unique(*rhs.m_thread_spec_up); - m_condition_text = rhs.m_condition_text; - m_condition_text_hash = rhs.m_condition_text_hash; + m_condition = rhs.m_condition; m_inject_condition = rhs.m_inject_condition; m_auto_continue = rhs.m_auto_continue; m_set_flags = rhs.m_set_flags; @@ -187,13 +185,11 @@ void BreakpointOptions::CopyOverSetOptions(const BreakpointOptions &incoming) if (incoming.m_set_flags.Test(eCondition)) { // If we're copying over an empty condition, mark it as unset. 
- if (incoming.m_condition_text.empty()) { - m_condition_text.clear(); - m_condition_text_hash = 0; + if (!incoming.m_condition) { + m_condition = StopCondition(); m_set_flags.Clear(eCondition); } else { - m_condition_text = incoming.m_condition_text; - m_condition_text_hash = incoming.m_condition_text_hash; + m_condition = incoming.m_condition; m_set_flags.Set(eCondition); } } @@ -363,7 +359,7 @@ StructuredData::ObjectSP BreakpointOptions::SerializeToStructuredData() { m_ignore_count); if (m_set_flags.Test(eCondition)) options_dict_sp->AddStringItem(GetKey(OptionNames::ConditionText), - m_condition_text); + m_condition.GetText()); if (m_set_flags.Test(eCallback) && m_baton_is_command_baton) { auto cmd_baton = @@ -464,29 +460,21 @@ bool BreakpointOptions::GetCommandLineCallbacks(StringList &command_list) { return true; } -void BreakpointOptions::SetCondition(const char *condition) { - if (!condition || condition[0] == '\0') { - condition = ""; +void BreakpointOptions::SetCondition(StopCondition condition) { + if (!condition) m_set_flags.Clear(eCondition); - } else m_set_flags.Set(eCondition); - m_condition_text.assign(condition); - std::hash hasher; - m_condition_text_hash = hasher(m_condition_text); + m_condition = std::move(condition); } -const char *BreakpointOptions::GetConditionText(size_t *hash) const { - if (!m_condition_text.empty()) { - if (hash) - *hash = m_condition_text_hash; - - return m_condition_text.c_str(); - } - return nullptr; +const StopCondition &BreakpointOptions::GetCondition() const { + return m_condition; } +StopCondition &BreakpointOptions::GetCondition() { return m_condition; } + const ThreadSpec *BreakpointOptions::GetThreadSpecNoCreate() const { return m_thread_spec_up.get(); } @@ -555,10 +543,10 @@ void BreakpointOptions::GetDescription(Stream *s, s->GetIndentLevel()); } } - if (!m_condition_text.empty()) { + if (m_condition) { if (level != eDescriptionLevelBrief) { s->EOL(); - s->Printf("Condition: %s\n", m_condition_text.c_str()); + 
s->Printf("Condition: %s\n", m_condition.GetText().data()); } } } @@ -652,5 +640,5 @@ void BreakpointOptions::Clear() m_baton_is_command_baton = false; m_callback_is_synchronous = false; m_enabled = false; - m_condition_text.clear(); + m_condition = StopCondition(); } diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index 2440a7e46e961..38ec375c03070 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -72,7 +72,7 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { case 'c': // Normally an empty breakpoint condition marks is as unset. But we need // to say it was passed in. - m_bp_opts.SetCondition(option_arg.str().c_str()); + m_bp_opts.GetCondition().SetText(option_arg.str()); m_bp_opts.m_set_flags.Set(BreakpointOptions::eCondition); break; case 'C': @@ -154,6 +154,21 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { m_bp_opts.GetThreadSpec()->SetIndex(thread_index); } } break; + case 'Y': { + LanguageType language = Language::GetLanguageTypeFromString(option_arg); + + LanguageSet languages_for_expressions = + Language::GetLanguagesSupportingTypeSystemsForExpressions(); + if (language == eLanguageTypeUnknown) + error = Status::FromError(CreateOptionParsingError( + option_arg, short_option, long_option, "invalid language")); + else if (!languages_for_expressions[language]) + error = Status::FromError( + CreateOptionParsingError(option_arg, short_option, long_option, + "no expression support for language")); + else + m_bp_opts.GetCondition().SetLanguage(language); + } break; default: llvm_unreachable("Unimplemented option"); } diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index e543566e4ff1e..acb741081cac3 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -95,6 +95,12 @@ let Command = "breakpoint modify" in { def 
breakpoint_modify_condition : Option<"condition", "c">, Group<1>, Arg<"Expression">, Desc<"The breakpoint stops only if this condition " "expression evaluates to true.">; + def breakpoint_modify_condition_language + : Option<"condition-language", "Y">, + Group<1>, + Arg<"Language">, + Desc<"Specifies the Language to use when executing the breakpoint's " + "condition expression.">; def breakpoint_modify_auto_continue : Option<"auto-continue", "G">, Group<1>, Arg<"Boolean">, Desc<"The breakpoint will auto-continue after running its commands.">; diff --git a/lldb/source/Host/common/HostThread.cpp b/lldb/source/Host/common/HostThread.cpp index eec029be1c091..8822be016b0a1 100644 --- a/lldb/source/Host/common/HostThread.cpp +++ b/lldb/source/Host/common/HostThread.cpp @@ -44,3 +44,9 @@ lldb::thread_result_t HostThread::GetResult() const { bool HostThread::EqualsThread(lldb::thread_t thread) const { return m_native_thread->EqualsThread(thread); } + +bool HostThread::HasThread() const { + if (!m_native_thread) + return false; + return m_native_thread->GetSystemHandle() != LLDB_INVALID_HOST_THREAD; +} diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index ed6047f8f4ef3..644084ba8d57a 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -1146,7 +1146,7 @@ class InstructionLLVMC : public lldb_private::Instruction { } } - if (Log *log = GetLog(LLDBLog::Process)) { + if (Log *log = GetLog(LLDBLog::Process | LLDBLog::Disassembler)) { StreamString ss; ss.Printf("[%s] expands to %zu operands:\n", operands_string, diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt index 3ec3cad4b8178..296159ea28407 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt @@ -34,6 
+34,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN LibStdcppTuple.cpp LibStdcppUniquePointer.cpp MsvcStl.cpp + MsvcStlSmartPointer.cpp MSVCUndecoratedNameParser.cpp LINK_COMPONENTS diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 17963c0273ba8..2db3e6f0ca315 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1540,16 +1540,6 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { lldb_private::formatters::LibStdcppUniquePtrSyntheticFrontEndCreator, "std::unique_ptr synthetic children", "^std::unique_ptr<.+>(( )?&)?$", stl_synth_flags, true); - AddCXXSynthetic( - cpp_category_sp, - lldb_private::formatters::LibStdcppSharedPtrSyntheticFrontEndCreator, - "std::shared_ptr synthetic children", "^std::shared_ptr<.+>(( )?&)?$", - stl_synth_flags, true); - AddCXXSynthetic( - cpp_category_sp, - lldb_private::formatters::LibStdcppSharedPtrSyntheticFrontEndCreator, - "std::weak_ptr synthetic children", "^std::weak_ptr<.+>(( )?&)?$", - stl_synth_flags, true); AddCXXSynthetic( cpp_category_sp, lldb_private::formatters::LibStdcppTupleSyntheticFrontEndCreator, @@ -1580,14 +1570,6 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { lldb_private::formatters::LibStdcppUniquePointerSummaryProvider, "libstdc++ std::unique_ptr summary provider", "^std::unique_ptr<.+>(( )?&)?$", stl_summary_flags, true); - AddCXXSummary(cpp_category_sp, - lldb_private::formatters::LibStdcppSmartPointerSummaryProvider, - "libstdc++ std::shared_ptr summary provider", - "^std::shared_ptr<.+>(( )?&)?$", stl_summary_flags, true); - AddCXXSummary(cpp_category_sp, - lldb_private::formatters::LibStdcppSmartPointerSummaryProvider, - "libstdc++ std::weak_ptr summary provider", - "^std::weak_ptr<.+>(( )?&)?$", stl_summary_flags, true); 
AddCXXSummary(cpp_category_sp, lldb_private::formatters::StdlibCoroutineHandleSummaryProvider, "libstdc++ std::coroutine_handle summary provider", @@ -1598,6 +1580,25 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "^std::optional<.+>(( )?&)?$", stl_summary_flags, true); } +static lldb_private::SyntheticChildrenFrontEnd * +GenericSmartPointerSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + if (IsMsvcStlSmartPointer(*valobj_sp)) + return MsvcStlSmartPointerSyntheticFrontEndCreator(valobj_sp); + return LibStdcppSharedPtrSyntheticFrontEndCreator(children, valobj_sp); +} + +static bool +GenericSmartPointerSummaryProvider(ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options) { + if (IsMsvcStlSmartPointer(valobj)) + return MsvcStlSmartPointerSummaryProvider(valobj, stream, options); + return LibStdcppSmartPointerSummaryProvider(valobj, stream, options); +} + /// Load formatters that are formatting types from more than one STL static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { if (!cpp_category_sp) @@ -1611,6 +1612,10 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { .SetDontShowValue(false) .SetShowMembersOneLiner(false) .SetHideItemNames(false); + SyntheticChildren::Flags stl_synth_flags; + stl_synth_flags.SetCascades(true).SetSkipPointers(false).SetSkipReferences( + false); + using StringElementType = StringPrinter::StringElementType; RegisterStdStringSummaryProvider( @@ -1636,6 +1641,20 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { return LibStdcppStringSummaryProvider(valobj, stream, options); }, "MSVC STL/libstdc++ std::wstring summary provider")); + + AddCXXSynthetic(cpp_category_sp, GenericSmartPointerSyntheticFrontEndCreator, + "std::shared_ptr synthetic children", + "^std::shared_ptr<.+>(( )?&)?$", stl_synth_flags, true); + 
AddCXXSynthetic(cpp_category_sp, GenericSmartPointerSyntheticFrontEndCreator, + "std::weak_ptr synthetic children", + "^std::weak_ptr<.+>(( )?&)?$", stl_synth_flags, true); + + AddCXXSummary(cpp_category_sp, GenericSmartPointerSummaryProvider, + "MSVC STL/libstdc++ std::shared_ptr summary provider", + "^std::shared_ptr<.+>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, GenericSmartPointerSummaryProvider, + "MSVC STL/libstdc++ std::weak_ptr summary provider", + "^std::weak_ptr<.+>(( )?&)?$", stl_summary_flags, true); } static void LoadMsvcStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/Generic.cpp b/lldb/source/Plugins/Language/CPlusPlus/Generic.cpp index b237a8a27090c..bfe86e4665f65 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Generic.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/Generic.cpp @@ -7,6 +7,8 @@ //===---------------------------------------------------------------------===// #include "Generic.h" +#include "LibStdcpp.h" +#include "MsvcStl.h" lldb::ValueObjectSP lldb_private::formatters::GetDesugaredSmartPointerValue( ValueObject &ptr, ValueObject &container) { @@ -16,7 +18,8 @@ lldb::ValueObjectSP lldb_private::formatters::GetDesugaredSmartPointerValue( auto arg = container_type.GetTypeTemplateArgument(0); if (!arg) - return nullptr; + // If there isn't enough debug info, use the pointer type as is + return ptr.GetSP(); return ptr.Cast(arg.GetPointerType()); } diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h index e4ed923033aa7..edf3f4e8a5387 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h @@ -29,6 +29,14 @@ bool MsvcStlWStringSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); // VC 2015+ std::wstring +// MSVC STL std::shared_ptr<> and std::weak_ptr<> +bool IsMsvcStlSmartPointer(ValueObject 
&valobj); +bool MsvcStlSmartPointerSummaryProvider(ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options); + +lldb_private::SyntheticChildrenFrontEnd * +MsvcStlSmartPointerSyntheticFrontEndCreator(lldb::ValueObjectSP valobj_sp); + } // namespace formatters } // namespace lldb_private diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStlSmartPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlSmartPointer.cpp new file mode 100644 index 0000000000000..b1aecc4b6611a --- /dev/null +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlSmartPointer.cpp @@ -0,0 +1,165 @@ +//===-- MsvcStlSmartPointer.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Generic.h" +#include "MsvcStl.h" + +#include "lldb/DataFormatters/FormattersHelpers.h" +#include "lldb/DataFormatters/TypeSynthetic.h" + +using namespace lldb; + +bool lldb_private::formatters::IsMsvcStlSmartPointer(ValueObject &valobj) { + if (auto valobj_sp = valobj.GetNonSyntheticValue()) + return valobj_sp->GetChildMemberWithName("_Ptr") != nullptr; + + return false; +} + +bool lldb_private::formatters::MsvcStlSmartPointerSummaryProvider( + ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { + ValueObjectSP valobj_sp(valobj.GetNonSyntheticValue()); + if (!valobj_sp) + return false; + + ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("_Ptr")); + ValueObjectSP ctrl_sp(valobj_sp->GetChildMemberWithName("_Rep")); + if (!ctrl_sp || !ptr_sp) + return false; + + DumpCxxSmartPtrPointerSummary(stream, *ptr_sp, options); + + bool success; + uint64_t ctrl_addr = ctrl_sp->GetValueAsUnsigned(0, &success); + // Empty control field (expired) + if (!success || ctrl_addr == 0) 
+ return true; + + uint64_t uses = 0; + if (auto uses_sp = ctrl_sp->GetChildMemberWithName("_Uses")) { + bool success; + uses = uses_sp->GetValueAsUnsigned(0, &success); + if (!success) + return false; + + stream.Printf(" strong=%" PRIu64, uses); + } + + // _Weaks is the number of weak references - (_Uses != 0). + if (auto weak_count_sp = ctrl_sp->GetChildMemberWithName("_Weaks")) { + bool success; + uint64_t count = weak_count_sp->GetValueAsUnsigned(0, &success); + if (!success) + return false; + + stream.Printf(" weak=%" PRIu64, count - (uses != 0)); + } + + return true; +} + +namespace lldb_private { +namespace formatters { + +class MsvcStlSmartPointerSyntheticFrontEnd : public SyntheticChildrenFrontEnd { +public: + MsvcStlSmartPointerSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + + llvm::Expected GetIndexOfChildWithName(ConstString name) override; + + ~MsvcStlSmartPointerSyntheticFrontEnd() override; + +private: + ValueObject *m_ptr_obj = nullptr; +}; + +} // namespace formatters +} // namespace lldb_private + +lldb_private::formatters::MsvcStlSmartPointerSyntheticFrontEnd:: + MsvcStlSmartPointerSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) + : SyntheticChildrenFrontEnd(*valobj_sp) { + if (valobj_sp) + Update(); +} + +llvm::Expected lldb_private::formatters:: + MsvcStlSmartPointerSyntheticFrontEnd::CalculateNumChildren() { + return (m_ptr_obj ? 
1 : 0); +} + +lldb::ValueObjectSP +lldb_private::formatters::MsvcStlSmartPointerSyntheticFrontEnd::GetChildAtIndex( + uint32_t idx) { + if (!m_ptr_obj) + return lldb::ValueObjectSP(); + + ValueObjectSP valobj_sp = m_backend.GetSP(); + if (!valobj_sp) + return lldb::ValueObjectSP(); + + if (idx == 0) + return m_ptr_obj->GetSP(); + + if (idx == 1) { + Status status; + ValueObjectSP value_sp = m_ptr_obj->Dereference(status); + if (status.Success()) + return value_sp; + } + + return lldb::ValueObjectSP(); +} + +lldb::ChildCacheState +lldb_private::formatters::MsvcStlSmartPointerSyntheticFrontEnd::Update() { + m_ptr_obj = nullptr; + + ValueObjectSP valobj_sp = m_backend.GetSP(); + if (!valobj_sp) + return lldb::ChildCacheState::eRefetch; + + auto ptr_obj_sp = valobj_sp->GetChildMemberWithName("_Ptr"); + if (!ptr_obj_sp) + return lldb::ChildCacheState::eRefetch; + + auto cast_ptr_sp = GetDesugaredSmartPointerValue(*ptr_obj_sp, *valobj_sp); + if (!cast_ptr_sp) + return lldb::ChildCacheState::eRefetch; + + m_ptr_obj = cast_ptr_sp->Clone(ConstString("pointer")).get(); + return lldb::ChildCacheState::eRefetch; +} + +llvm::Expected +lldb_private::formatters::MsvcStlSmartPointerSyntheticFrontEnd:: + GetIndexOfChildWithName(ConstString name) { + if (name == "pointer") + return 0; + + if (name == "object" || name == "$$dereference$$") + return 1; + + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); +} + +lldb_private::formatters::MsvcStlSmartPointerSyntheticFrontEnd:: + ~MsvcStlSmartPointerSyntheticFrontEnd() = default; + +lldb_private::SyntheticChildrenFrontEnd * +lldb_private::formatters::MsvcStlSmartPointerSyntheticFrontEndCreator( + lldb::ValueObjectSP valobj_sp) { + return new MsvcStlSmartPointerSyntheticFrontEnd(valobj_sp); +} diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index bba1230c79920..2aa02fd58335e 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -1271,7 +1271,7 @@ 
uint32_t Process::AssignIndexIDToThread(uint64_t thread_id) { } StateType Process::GetState() { - if (CurrentThreadIsPrivateStateThread()) + if (CurrentThreadPosesAsPrivateStateThread()) return m_private_state.GetValue(); else return m_public_state.GetValue(); @@ -3144,16 +3144,19 @@ void Process::CompleteAttach() { } } - if (!m_os_up) { + // If we don't have an operating system plugin loaded yet, see if + // LoadOperatingSystemPlugin can find one (and stuff it in m_os_up). + if (!m_os_up) LoadOperatingSystemPlugin(false); - if (m_os_up) { - // Somebody might have gotten threads before now, but we need to force the - // update after we've loaded the OperatingSystem plugin or it won't get a - // chance to process the threads. - m_thread_list.Clear(); - UpdateThreadListIfNeeded(); - } + + if (m_os_up) { + // Somebody might have gotten threads before we loaded the OS Plugin above, + // so we need to force the update now or the newly loaded plugin won't get + // a chance to process the threads. + m_thread_list.Clear(); + UpdateThreadListIfNeeded(); } + // Figure out which one is the executable, and set that in our target: ModuleSP new_executable_module_sp; for (ModuleSP module_sp : GetTarget().GetImages().Modules()) { @@ -5856,6 +5859,13 @@ bool Process::CurrentThreadIsPrivateStateThread() return m_private_state_thread.EqualsThread(Host::GetCurrentThread()); } +bool Process::CurrentThreadPosesAsPrivateStateThread() { + // If we haven't started up the private state thread yet, then whatever thread + // is fetching this event should be temporarily the private state thread. 
+ if (!m_private_state_thread.HasThread()) + return true; + return m_private_state_thread.EqualsThread(Host::GetCurrentThread()); +} void Process::Flush() { m_thread_list.Flush(); diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index 9c6208e9e0a65..16cd2548c2784 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -723,7 +723,7 @@ void StackFrameList::SelectMostRelevantFrame() { // Don't call into the frame recognizers on the private state thread as // they can cause code to run in the target, and that can cause deadlocks // when fetching stop events for the expression. - if (m_thread.GetProcess()->CurrentThreadIsPrivateStateThread()) + if (m_thread.GetProcess()->CurrentThreadPosesAsPrivateStateThread()) return; Log *log = GetLog(LLDBLog::Thread); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 3160446ae1d17..19f89b8246926 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -465,7 +465,7 @@ class StopInfoBreakpoint : public StopInfo { // should stop, then we'll run the callback for the breakpoint. If // the callback says we shouldn't stop that will win. 
- if (bp_loc_sp->GetConditionText() == nullptr) + if (!bp_loc_sp->GetCondition()) actually_hit_any_locations = true; else { Status condition_error; @@ -484,7 +484,7 @@ class StopInfoBreakpoint : public StopInfo { strm << "stopped due to an error evaluating condition of " "breakpoint "; bp_loc_sp->GetDescription(&strm, eDescriptionLevelBrief); - strm << ": \"" << bp_loc_sp->GetConditionText() << "\"\n"; + strm << ": \"" << bp_loc_sp->GetCondition().GetText() << "\"\n"; strm << err_str; Debugger::ReportError( diff --git a/lldb/source/Utility/LLDBLog.cpp b/lldb/source/Utility/LLDBLog.cpp index b193bd4eb07dc..613dae42064a8 100644 --- a/lldb/source/Utility/LLDBLog.cpp +++ b/lldb/source/Utility/LLDBLog.cpp @@ -64,6 +64,9 @@ static constexpr Log::Category g_categories[] = { {"log symbol on-demand related activities"}, LLDBLog::OnDemand}, {{"source"}, {"log source related activities"}, LLDBLog::Source}, + {{"disassembler"}, + {"log disassembler related activities"}, + LLDBLog::Disassembler}, }; static Log::Channel g_log_channel(g_categories, diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py b/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py index 4e7a8ccb9fbeb..a4c9c49bc89b6 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py @@ -19,6 +19,16 @@ def test_breakpoint_condition_inline_and_run_command(self): self.build() self.breakpoint_conditions(inline=True) + def test_breakpoint_condition_and_run_command_language(self): + """Exercise breakpoint condition with 'breakpoint modify -c id'.""" + self.build() + self.breakpoint_conditions(cpp=True) + + def test_breakpoint_condition_inline_and_run_command_language(self): + """Exercise breakpoint condition inline with 'breakpoint set'.""" + self.build() + 
self.breakpoint_conditions(inline=True, cpp=True) + @add_test_categories(["pyapi"]) def test_breakpoint_condition_and_python_api(self): """Use Python APIs to set breakpoint conditions.""" @@ -42,17 +52,24 @@ def setUp(self): "main.c", "// Find the line number of c's parent call here." ) - def breakpoint_conditions(self, inline=False): + def breakpoint_conditions(self, inline=False, cpp=False): """Exercise breakpoint condition with 'breakpoint modify -c id'.""" exe = self.getBuildArtifact("a.out") self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) + if cpp: + condition = "&val != nullptr && val == 3" + cmd_args = " -c '{}' -Y c++".format(condition) + else: + condition = "val == 3" + cmd_args = "-c '{}'".format(condition) + if inline: # Create a breakpoint by function name 'c' and set the condition. lldbutil.run_break_set_by_symbol( self, "c", - extra_options="-c 'val == 3'", + extra_options=cmd_args, num_expected_locations=1, sym_exact=True, ) @@ -63,7 +80,7 @@ def breakpoint_conditions(self, inline=False): ) # And set a condition on the breakpoint to stop on when 'val == 3'. - self.runCmd("breakpoint modify -c 'val == 3' 1") + self.runCmd("breakpoint modify " + cmd_args + " 1") # Now run the program. 
self.runCmd("run", RUN_SUCCEEDED) @@ -82,7 +99,11 @@ def breakpoint_conditions(self, inline=False): self.expect( "breakpoint list -f", BREAKPOINT_HIT_ONCE, - substrs=["resolved = 1", "Condition: val == 3", "hit count = 1"], + substrs=[ + "resolved = 1", + "Condition: {}".format(condition), + "hit count = 1", + ], ) # The frame #0 should correspond to main.c:36, the executable statement diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/Makefile similarity index 54% rename from lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/Makefile rename to lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/Makefile index 564cbada74e08..99998b20bcb05 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/Makefile +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/Makefile @@ -1,6 +1,3 @@ CXX_SOURCES := main.cpp -USE_LIBCPP := 1 - -CXXFLAGS_EXTRAS := -O0 include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/TestDataFormatterLibccIterator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/TestDataFormatterStdIterator.py similarity index 69% rename from lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/TestDataFormatterLibccIterator.py rename to lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/TestDataFormatterStdIterator.py index c43ee46fb658a..373b1c9a2c8e8 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/TestDataFormatterLibccIterator.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/TestDataFormatterStdIterator.py @@ -2,14 +2,13 @@ Test lldb data formatter subsystem. 
""" - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -class LibcxxIteratorDataFormatterTestCase(TestBase): +class StdIteratorDataFormatterTestCase(TestBase): def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -17,10 +16,8 @@ def setUp(self): self.line = line_number("main.cpp", "// Set break point at this line.") self.namespace = "std" - @add_test_categories(["libc++"]) - def test_with_run_command(self): - """Test that libc++ iterators format properly.""" - self.build() + def do_test(self): + """Test that iterators format properly.""" self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) lldbutil.run_break_set_by_file_and_line( @@ -69,18 +66,12 @@ def cleanup(): self.expect("frame variable svI", substrs=['item = "hello"']) self.expect("expr svI", substrs=['item = "hello"']) - self.expect("frame variable iiumI", substrs=["first = 61453", "second = 51966"]) - self.expect("expr iiumI", substrs=["first = 61453", "second = 51966"]) - - self.expect("frame variable siumI", substrs=['first = "hello"', "second = 137"]) - self.expect("expr siumI", substrs=['first = "hello"', "second = 137"]) - - self.expect("frame variable iiumI.first", substrs=["first = 61453"]) - self.expect("frame variable iiumI.first", substrs=["second"], matching=False) - self.expect("frame variable iiumI.second", substrs=["second = 51966"]) - self.expect("frame variable iiumI.second", substrs=["first"], matching=False) - - self.expect("frame variable siumI.first", substrs=['first = "hello"']) - self.expect("frame variable siumI.first", substrs=["second"], matching=False) - self.expect("frame variable siumI.second", substrs=["second = 137"]) - self.expect("frame variable siumI.second", substrs=["first"], matching=False) + @add_test_categories(["libc++"]) + def test_libcxx(self): + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test() + + @add_test_categories(["libstdcxx"]) + def 
test_libstdcxx(self): + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/main.cpp similarity index 67% rename from lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/main.cpp rename to lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/main.cpp index e53c0f167c325..8c11df075f247 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/iterator/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/iterator/main.cpp @@ -5,16 +5,11 @@ typedef std::map intint_map; typedef std::map strint_map; -typedef std::unordered_map intint_umap; -typedef std::unordered_map strint_umap; - typedef std::vector int_vector; typedef std::vector string_vector; typedef intint_map::iterator ii_map_iter; typedef strint_map::iterator si_map_iter; -typedef intint_umap::iterator ii_umap_iter; -typedef strint_umap::iterator si_umap_iter; typedef int_vector::iterator ivter; typedef string_vector::iterator svter; @@ -26,12 +21,6 @@ int main() { strint_map sim; sim["world"] = 42; - intint_umap iium; - iium[0xF00D] = 0xCAFE; - - strint_umap sium; - sium["hello"] = 137; - int_vector iv; iv.push_back(3); @@ -40,8 +29,6 @@ int main() { ii_map_iter iimI = iim.begin(); si_map_iter simI = sim.begin(); - ii_umap_iter iiumI = iium.begin(); - si_umap_iter siumI = sium.begin(); ivter ivI = iv.begin(); svter svI = sv.begin(); diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py index 3d8569da0332e..d71fbf8d5f81a 100644 --- 
a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py @@ -118,3 +118,9 @@ def test_libcxx(self): def test_libstdcxx(self): self.build(dictionary={"USE_LIBSTDCPP": 1}) self.do_test() + + @add_test_categories(["msvcstl"]) + def test_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. + self.build() + self.do_test() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/Makefile deleted file mode 100644 index c825977b1a5dc..0000000000000 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -CXX_SOURCES := main.cpp - -CFLAGS_EXTRAS := -O0 -USE_LIBSTDCPP := 1 - -include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/TestDataFormatterStdIterator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/TestDataFormatterStdIterator.py deleted file mode 100644 index a0d34fb56f970..0000000000000 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/TestDataFormatterStdIterator.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Test lldb data formatter subsystem. -""" - - -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class StdIteratorDataFormatterTestCase(TestBase): - def setUp(self): - # Call super's setUp(). - TestBase.setUp(self) - # Find the line number to break at. 
- self.line = line_number("main.cpp", "// Set break point at this line.") - - @add_test_categories(["libstdcxx"]) - @expectedFailureAll(bugnumber="llvm.org/pr50861", compiler="gcc") - def test_with_run_command(self): - """Test that libstdcpp iterators format properly.""" - self.build() - self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) - - lldbutil.run_break_set_by_file_and_line( - self, "main.cpp", self.line, num_expected_locations=-1 - ) - - self.runCmd("run", RUN_SUCCEEDED) - - # The stop reason of the thread should be breakpoint. - self.expect( - "thread list", - STOPPED_DUE_TO_BREAKPOINT, - substrs=["stopped", "stop reason = breakpoint"], - ) - - # This is the function to remove the custom formats in order to have a - # clean slate for the next test case. - def cleanup(): - self.runCmd("type format clear", check=False) - self.runCmd("type summary clear", check=False) - self.runCmd("type filter clear", check=False) - self.runCmd("type synth clear", check=False) - - # Execute the cleanup function during test case tear down. 
- self.addTearDownHook(cleanup) - - self.expect("frame variable ivI", substrs=["item = 3"]) - self.expect("expr ivI", substrs=["item = 3"]) - - self.expect("frame variable iimI", substrs=["first = 0", "second = 12"]) - self.expect("expr iimI", substrs=["first = 0", "second = 12"]) - - self.expect("frame variable simI", substrs=['first = "world"', "second = 42"]) - self.expect("expr simI", substrs=['first = "world"', "second = 42"]) - - self.expect("frame variable svI", substrs=['item = "hello"']) - self.expect("expr svI", substrs=['item = "hello"']) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/main.cpp deleted file mode 100644 index 7ddffd19012e7..0000000000000 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/iterator/main.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include - -typedef std::map intint_map; -typedef std::map strint_map; - -typedef std::vector int_vector; -typedef std::vector string_vector; - -typedef intint_map::iterator iimter; -typedef strint_map::iterator simter; - -typedef int_vector::iterator ivter; -typedef string_vector::iterator svter; - -int main() -{ - intint_map iim; - iim[0] = 12; - - strint_map sim; - sim["world"] = 42; - - int_vector iv; - iv.push_back(3); - - string_vector sv; - sv.push_back("hello"); - - iimter iimI = iim.begin(); - simter simI = sim.begin(); - - ivter ivI = iv.begin(); - svter svI = sv.begin(); - - return 0; // Set break point at this line. 
-} diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/operating_system.py b/lldb/test/API/functionalities/plugins/python_os_plugin/operating_system.py index f4404d78492f9..de9900cae4b75 100644 --- a/lldb/test/API/functionalities/plugins/python_os_plugin/operating_system.py +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/operating_system.py @@ -24,6 +24,10 @@ def create_thread(self, tid, context): return None def get_thread_info(self): + if self.process.state != lldb.eStateStopped: + print("Error: get_thread_info called with state not stopped") + return [] + if not self.threads: self.threads = [ { diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/Makefile b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/Makefile new file mode 100644 index 0000000000000..93618844a7a4d --- /dev/null +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/Makefile @@ -0,0 +1,4 @@ +C_SOURCES := main.c +ENABLE_THREADS := YES + +include Makefile.rules diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/TestOSIndSYM.py b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/TestOSIndSYM.py new file mode 100644 index 0000000000000..f0d192be661bb --- /dev/null +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/TestOSIndSYM.py @@ -0,0 +1,153 @@ +""" +Test that an OS plugin in a dSYM sees the right process state +when run from a dSYM on attach +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbsuite.test.lldbutil as lldbutil +from lldbgdbserverutils import get_debugserver_exe + +import os +import lldb +import time +import socket +import shutil + + +class TestOSPluginIndSYM(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + # The port used by debugserver. + PORT = 54638 + + # The number of attempts. 
+ ATTEMPTS = 10 + + # Time given to the binary to launch and to debugserver to attach to it for + # every attempt. We'll wait a maximum of 10 times 2 seconds while the + # inferior will wait 10 times 10 seconds. + TIMEOUT = 2 + + def no_debugserver(self): + if get_debugserver_exe() is None: + return "no debugserver" + return None + + def port_not_available(self): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + if s.connect_ex(("127.0.0.1", self.PORT)) == 0: + return "{} not available".format(self.PORT) + return None + + @skipUnlessDarwin + def test_python_os_plugin(self): + self.do_test_python_os_plugin(False) + + @skipTestIfFn(no_debugserver) + @skipTestIfFn(port_not_available) + def test_python_os_plugin_remote(self): + self.do_test_python_os_plugin(True) + + def do_test_python_os_plugin(self, remote): + """Test that the environment for os plugins in dSYM's is correct""" + executable = self.build_dsym("my_binary") + + # Make sure we're set up to load the symbol file's python + self.runCmd("settings set target.load-script-from-symbol-file true") + + target = self.dbg.CreateTarget(None) + + error = lldb.SBError() + + # Now run the process, and then attach. When the attach + # succeeds, make sure that we were in the right state when + # the OS plugins were run. 
+ if not remote: + popen = self.spawnSubprocess(executable, []) + + process = target.AttachToProcessWithID(lldb.SBListener(), popen.pid, error) + self.assertSuccess(error, "Attach succeeded") + else: + self.setup_remote_platform(executable) + process = target.process + self.assertTrue(process.IsValid(), "Got a valid process from debugserver") + + # We should have figured out the target from the result of the attach: + self.assertTrue(target.IsValid, "Got a valid target") + + # Make sure that we got the right plugin: + self.expect( + "settings show target.process.python-os-plugin-path", + substrs=["operating_system.py"], + ) + + for thread in process.threads: + stack_depth = thread.num_frames + reg_threads = thread.frames[0].reg + + # OKAY, that realized the threads, now see if the creation + # state was correct. The way we use the OS plugin, it doesn't need + # to create a thread, and doesn't have to call get_register_info, + # so we don't expect those to get called. + self.expect( + "test_report_command", + substrs=[ + "in_init=1", + "in_get_thread_info=1", + "in_create_thread=2", + "in_get_register_info=2", + "in_get_register_data=1", + ], + ) + + def build_dsym(self, name): + self.build(debug_info="dsym", dictionary={"EXE": name}) + executable = self.getBuildArtifact(name) + dsym_path = self.getBuildArtifact(name + ".dSYM") + python_dir_path = dsym_path + python_dir_path = os.path.join(dsym_path, "Contents", "Resources", "Python") + if not os.path.exists(python_dir_path): + os.mkdir(python_dir_path) + python_file_name = name + ".py" + + os_plugin_dir = os.path.join(python_dir_path, "OS_Plugin") + if not os.path.exists(os_plugin_dir): + os.mkdir(os_plugin_dir) + + plugin_dest_path = os.path.join(os_plugin_dir, "operating_system.py") + plugin_origin_path = os.path.join(self.getSourceDir(), "operating_system.py") + shutil.copy(plugin_origin_path, plugin_dest_path) + + module_dest_path = os.path.join(python_dir_path, python_file_name) + with open(module_dest_path, 
"w") as f: + f.write("def __lldb_init_module(debugger, unused):\n") + f.write( + f" debugger.HandleCommand(\"settings set target.process.python-os-plugin-path '{plugin_dest_path}'\")\n" + ) + f.close() + + return executable + + def setup_remote_platform(self, exe): + # Get debugserver to start up our process for us, and then we + # can use `process connect` to attach to it. + debugserver = get_debugserver_exe() + debugserver_args = ["localhost:{}".format(self.PORT), exe] + self.spawnSubprocess(debugserver, debugserver_args) + + # Select the platform. + self.runCmd("platform select remote-gdb-server") + + # Connect to debugserver + interpreter = self.dbg.GetCommandInterpreter() + connected = False + for i in range(self.ATTEMPTS): + result = lldb.SBCommandReturnObject() + interpreter.HandleCommand(f"gdb-remote localhost:{self.PORT}", result) + connected = result.Succeeded() + if connected: + break + time.sleep(self.TIMEOUT) + + self.assertTrue(connected, "could not connect to debugserver") diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/main.c b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/main.c new file mode 100644 index 0000000000000..8e03f395e6110 --- /dev/null +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/main.c @@ -0,0 +1,8 @@ +#include + +int main() { + while (1) { + sleep(1); + } + return 0; +} diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/operating_system.py b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/operating_system.py new file mode 100644 index 0000000000000..0f9cec670b73f --- /dev/null +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/os_plugin_in_dsym/operating_system.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import lldb +import struct + +# Value is: +# 0 called - state is not stopped +# 1 called - state is stopped +# 2 not called + +stop_state = { + "in_init": 2, + 
"in_get_thread_info": 2, + "in_create_thread": 2, + "in_get_register_info": 2, + "in_get_register_data": 2, +} + + +def ReportCommand(debugger, command, exe_ctx, result, unused): + global stop_state + for state in stop_state: + result.AppendMessage(f"{state}={stop_state[state]}\n") + result.SetStatus(lldb.eReturnStatusSuccessFinishResult) + + +class OperatingSystemPlugIn: + """This class checks that all the""" + + def __init__(self, process): + """Initialization needs a valid.SBProcess object. + global stop_state + + This plug-in will get created after a live process is valid and has stopped for the + first time.""" + self.process = process + stop_state["in_init"] = self.state_is_stopped() + interp = process.target.debugger.GetCommandInterpreter() + result = lldb.SBCommandReturnObject() + cmd_str = ( + f"command script add test_report_command -o -f {__name__}.ReportCommand" + ) + interp.HandleCommand(cmd_str, result) + + def state_is_stopped(self): + if self.process.state == lldb.eStateStopped: + return 1 + else: + return 0 + + def does_plugin_report_all_threads(self): + return True + + def create_thread(self, tid, context): + global stop_state + stop_state["in_create_thread"] = self.state_is_stopped() + + return None + + def get_thread_info(self): + global stop_state + stop_state["in_get_thread_info"] = self.state_is_stopped() + idx = self.process.threads[0].idx + return [ + { + "tid": 0x111111111, + "name": "one", + "queue": "queue1", + "state": "stopped", + "stop_reason": "breakpoint", + "core": idx, + } + ] + + def get_register_info(self): + global stop_state + stop_state["in_get_register_info"] = self.state_is_stopped() + return None + + def get_register_data(self, tid): + global stop_state + stop_state["in_get_register_data"] = self.state_is_stopped() + return None diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index ae8142ae4f484..a611cc30c1897 100644 --- 
a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -44,22 +44,39 @@ def test_failing_launch_program(self): "'{0}' does not exist".format(program), response["body"]["error"]["format"] ) - def test_failing_launch_commands_and_run_in_terminal(self): + def test_failing_launch_commands_and_console(self): """ - Tests launching with an invalid program. + Tests launching with launch commands in an integrated terminal. """ program = self.getBuildArtifact("a.out") self.create_debug_adapter() response = self.launch( - program, launchCommands=["a b c"], runInTerminal=True, expectFailure=True + program, + launchCommands=["a b c"], + console="integratedTerminal", + expectFailure=True, ) self.assertFalse(response["success"]) self.assertTrue(self.get_dict_value(response, ["body", "error", "showUser"])) self.assertEqual( - "'launchCommands' and 'runInTerminal' are mutually exclusive", + "'launchCommands' and non-internal 'console' are mutually exclusive", self.get_dict_value(response, ["body", "error", "format"]), ) + def test_failing_console(self): + """ + Tests launching in console with an invalid terminal type. 
+ """ + program = self.getBuildArtifact("a.out") + self.create_debug_adapter() + response = self.launch(program, console="invalid", expectFailure=True) + self.assertFalse(response["success"]) + self.assertTrue(self.get_dict_value(response, ["body", "error", "showUser"])) + self.assertRegex( + response["body"]["error"]["format"], + r"unexpected value, expected 'internalConsole\', 'integratedTerminal\' or 'externalTerminal\' at arguments.console", + ) + @skipIfWindows def test_termination(self): """ diff --git a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py index 9cfa9b20f6051..3b769d2dd89ce 100644 --- a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py +++ b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py @@ -28,7 +28,7 @@ def test_stack_frame_name(self): parent_frame = self.dap_server.get_stackFrame(frameIndex=1) self.assertTrue(parent_frame["name"].endswith(" [opt]")) - @skipIfAsan # On ASAN builds this test intermittently fails https://github.com/llvm/llvm-project/issues/111061 + @skipIfAsan # On ASAN builds this test intermittently fails https://github.com/llvm/llvm-project/issues/111061 @skipIfWindows def test_optimized_variable(self): """Test optimized variable value contains error.""" @@ -50,9 +50,8 @@ def test_optimized_variable(self): value.startswith("&1 | FileCheck %s --check-prefix INVALID +INVALID: error: Invalid value ('bogus') for -Y (condition-language): invalid language + +RUN: not %lldb -b -o 'break set -n foo -c bar -Y python' 2>&1 | FileCheck %s --check-prefix NOEXPRSUPPORT +NOEXPRSUPPORT: error: Invalid value ('python') for -Y (condition-language): no expression support for language diff --git a/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp index 1d7b4b7009462..553cbeaf849e2 100644 --- a/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp +++ 
b/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp @@ -23,9 +23,10 @@ namespace lldb_dap { /// Launch request; value of command field is 'launch'. Error LaunchRequestHandler::Run(const LaunchRequestArguments &arguments) const { // Validate that we have a well formed launch request. - if (!arguments.launchCommands.empty() && arguments.runInTerminal) + if (!arguments.launchCommands.empty() && + arguments.console != protocol::eConsoleInternal) return make_error( - "'launchCommands' and 'runInTerminal' are mutually exclusive"); + "'launchCommands' and non-internal 'console' are mutually exclusive"); dap.SetConfiguration(arguments.configuration, /*is_attach=*/false); dap.last_launch_request = arguments; diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp index 93bc80a38e29d..4fadf1c22e0e3 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp @@ -80,7 +80,8 @@ RunInTerminal(DAP &dap, const protocol::LaunchRequestArguments &arguments) { llvm::json::Object reverse_request = CreateRunInTerminalReverseRequest( arguments.configuration.program, arguments.args, arguments.env, - arguments.cwd, comm_file.m_path, debugger_pid); + arguments.cwd, comm_file.m_path, debugger_pid, + arguments.console == protocol::eConsoleExternalTerminal); dap.SendReverseRequest("runInTerminal", std::move(reverse_request)); @@ -192,7 +193,7 @@ llvm::Error BaseRequestHandler::LaunchProcess( // about process state changes during the launch. 
ScopeSyncMode scope_sync_mode(dap.debugger); - if (arguments.runInTerminal) { + if (arguments.console != protocol::eConsoleInternal) { if (llvm::Error err = RunInTerminal(dap, arguments)) return err; } else if (launchCommands.empty()) { diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index 07b079d19896d..16f8062f97d7b 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -540,11 +540,14 @@ class ThreadsRequestHandler Run(const protocol::ThreadsArguments &) const override; }; -class VariablesRequestHandler : public LegacyRequestHandler { +class VariablesRequestHandler + : public RequestHandler> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "variables"; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected + Run(const protocol::VariablesArguments &) const override; }; class LocationsRequestHandler : public LegacyRequestHandler { diff --git a/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp index 19bcca2b22b9b..5fa2b1ef5e20d 100644 --- a/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp @@ -8,107 +8,37 @@ #include "DAP.h" #include "EventHelper.h" +#include "Handler/RequestHandler.h" #include "JSONUtils.h" -#include "RequestHandler.h" +#include "ProtocolUtils.h" + +using namespace llvm; +using namespace lldb_dap::protocol; namespace lldb_dap { -// "VariablesRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Variables request; value of command field is 'variables'. -// Retrieves all child variables for the given variable reference. 
An -// optional filter can be used to limit the fetched children to either named -// or indexed children.", "properties": { -// "command": { -// "type": "string", -// "enum": [ "variables" ] -// }, -// "arguments": { -// "$ref": "#/definitions/VariablesArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "VariablesArguments": { -// "type": "object", -// "description": "Arguments for 'variables' request.", -// "properties": { -// "variablesReference": { -// "type": "integer", -// "description": "The Variable reference." -// }, -// "filter": { -// "type": "string", -// "enum": [ "indexed", "named" ], -// "description": "Optional filter to limit the child variables to either -// named or indexed. If ommited, both types are fetched." -// }, -// "start": { -// "type": "integer", -// "description": "The index of the first variable to return; if omitted -// children start at 0." -// }, -// "count": { -// "type": "integer", -// "description": "The number of variables to return. If count is missing -// or 0, all variables are returned." -// }, -// "format": { -// "$ref": "#/definitions/ValueFormat", -// "description": "Specifies details on how to format the Variable -// values." -// } -// }, -// "required": [ "variablesReference" ] -// }, -// "VariablesResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to 'variables' request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "variables": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/Variable" -// }, -// "description": "All (or a range) of variables for the given -// variable reference." 
-// } -// }, -// "required": [ "variables" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -void VariablesRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - llvm::json::Array variables; - const auto *arguments = request.getObject("arguments"); - const auto variablesReference = - GetInteger(arguments, "variablesReference").value_or(0); - const auto start = GetInteger(arguments, "start").value_or(0); - const auto count = GetInteger(arguments, "count").value_or(0); +/// Retrieves all child variables for the given variable reference. +/// +/// A filter can be used to limit the fetched children to either named or +/// indexed children. +Expected +VariablesRequestHandler::Run(const VariablesArguments &arguments) const { + const uint64_t var_ref = arguments.variablesReference; + const uint64_t count = arguments.count; + const uint64_t start = arguments.start; bool hex = false; - const auto *format = arguments->getObject("format"); - if (format) - hex = GetBoolean(format, "hex").value_or(false); + if (arguments.format) + hex = arguments.format->hex; + + std::vector variables; - if (lldb::SBValueList *top_scope = - dap.variables.GetTopLevelScope(variablesReference)) { + if (lldb::SBValueList *top_scope = dap.variables.GetTopLevelScope(var_ref)) { // variablesReference is one of our scopes, not an actual variable it is // asking for the list of args, locals or globals. int64_t start_idx = 0; int64_t num_children = 0; - if (variablesReference == VARREF_REGS) { + if (var_ref == VARREF_REGS) { // Change the default format of any pointer sized registers in the first // register set to be the lldb::eFormatAddressInfo so we show the pointer // and resolve what the pointer resolves to. 
Only change the format if the @@ -128,7 +58,7 @@ void VariablesRequestHandler::operator()( } num_children = top_scope->GetSize(); - if (num_children == 0 && variablesReference == VARREF_LOCALS) { + if (num_children == 0 && var_ref == VARREF_LOCALS) { // Check for an error in the SBValueList that might explain why we don't // have locals. If we have an error display it as the sole value in the // the locals. @@ -145,12 +75,11 @@ void VariablesRequestHandler::operator()( // errors are only set when there is a problem that the user could // fix, so no error will show up when you have no debug info, only when // we do have debug info and something that is fixable can be done. - llvm::json::Object object; - EmplaceSafeString(object, "name", ""); - EmplaceSafeString(object, "type", "const char *"); - EmplaceSafeString(object, "value", var_err); - object.try_emplace("variablesReference", (int64_t)0); - variables.emplace_back(std::move(object)); + Variable var; + var.name = ""; + var.type = "const char *"; + var.value = var_err; + variables.emplace_back(var); } } const int64_t end_idx = start_idx + ((count == 0) ? 
num_children : count); @@ -165,7 +94,7 @@ void VariablesRequestHandler::operator()( } // Show return value if there is any ( in the local top frame ) - if (variablesReference == VARREF_LOCALS) { + if (var_ref == VARREF_LOCALS) { auto process = dap.target.GetProcess(); auto selected_thread = process.GetSelectedThread(); lldb::SBValue stop_return_value = selected_thread.GetStopReturnValue(); @@ -194,32 +123,35 @@ void VariablesRequestHandler::operator()( if (!variable.IsValid()) break; - int64_t var_ref = + const int64_t frame_var_ref = dap.variables.InsertVariable(variable, /*is_permanent=*/false); variables.emplace_back(CreateVariable( - variable, var_ref, hex, dap.configuration.enableAutoVariableSummaries, + variable, frame_var_ref, hex, + dap.configuration.enableAutoVariableSummaries, dap.configuration.enableSyntheticChildDebugging, variable_name_counts[GetNonNullVariableName(variable)] > 1)); } } else { // We are expanding a variable that has children, so we will return its // children. 
- lldb::SBValue variable = dap.variables.GetVariable(variablesReference); + lldb::SBValue variable = dap.variables.GetVariable(var_ref); if (variable.IsValid()) { + const bool is_permanent = + dap.variables.IsPermanentVariableReference(var_ref); auto addChild = [&](lldb::SBValue child, std::optional custom_name = {}) { if (!child.IsValid()) return; - bool is_permanent = - dap.variables.IsPermanentVariableReference(variablesReference); - int64_t var_ref = dap.variables.InsertVariable(child, is_permanent); - variables.emplace_back(CreateVariable( - child, var_ref, hex, dap.configuration.enableAutoVariableSummaries, - dap.configuration.enableSyntheticChildDebugging, - /*is_name_duplicated=*/false, custom_name)); + const int64_t child_var_ref = + dap.variables.InsertVariable(child, is_permanent); + variables.emplace_back( + CreateVariable(child, child_var_ref, hex, + dap.configuration.enableAutoVariableSummaries, + dap.configuration.enableSyntheticChildDebugging, + /*is_name_duplicated=*/false, custom_name)); }; const int64_t num_children = variable.GetNumChildren(); - int64_t end_idx = start + ((count == 0) ? num_children : count); + const int64_t end_idx = start + ((count == 0) ? 
num_children : count); int64_t i = start; for (; i < end_idx && i < num_children; ++i) addChild(variable.GetChildAtIndex(i)); @@ -233,10 +165,8 @@ void VariablesRequestHandler::operator()( addChild(variable.GetNonSyntheticValue(), "[raw]"); } } - llvm::json::Object body; - body.try_emplace("variables", std::move(variables)); - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + + return VariablesResponseBody{variables}; } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 553c52605c998..41ca29a405ac9 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -120,6 +120,42 @@ DecodeMemoryReference(llvm::StringRef memoryReference) { return addr; } +bool DecodeMemoryReference(const llvm::json::Value &v, llvm::StringLiteral key, + lldb::addr_t &out, llvm::json::Path path, + bool required) { + const llvm::json::Object *v_obj = v.getAsObject(); + if (!v_obj) { + path.report("expected object"); + return false; + } + + const llvm::json::Value *mem_ref_value = v_obj->get(key); + if (!mem_ref_value) { + if (!required) + return true; + + path.field(key).report("missing value"); + return false; + } + + const std::optional mem_ref_str = + mem_ref_value->getAsString(); + if (!mem_ref_str) { + path.field(key).report("expected string"); + return false; + } + + const std::optional addr_opt = + DecodeMemoryReference(*mem_ref_str); + if (!addr_opt) { + path.field(key).report("malformed memory reference"); + return false; + } + + out = *addr_opt; + return true; +} + std::vector GetStrings(const llvm::json::Object *obj, llvm::StringRef key) { std::vector strs; @@ -768,38 +804,6 @@ VariableDescription::VariableDescription(lldb::SBValue v, evaluate_name = llvm::StringRef(evaluateStream.GetData()).str(); } -llvm::json::Object VariableDescription::GetVariableExtensionsJSON() { - llvm::json::Object extensions; - if (error) - 
EmplaceSafeString(extensions, "error", *error); - if (!value.empty()) - EmplaceSafeString(extensions, "value", value); - if (!summary.empty()) - EmplaceSafeString(extensions, "summary", summary); - if (auto_summary) - EmplaceSafeString(extensions, "autoSummary", *auto_summary); - - if (lldb::SBDeclaration decl = v.GetDeclaration(); decl.IsValid()) { - llvm::json::Object decl_obj; - if (lldb::SBFileSpec file = decl.GetFileSpec(); file.IsValid()) { - char path[PATH_MAX] = ""; - if (file.GetPath(path, sizeof(path)) && - lldb::SBFileSpec::ResolvePath(path, path, PATH_MAX)) { - decl_obj.try_emplace("path", std::string(path)); - } - } - - if (int line = decl.GetLine()) - decl_obj.try_emplace("line", line); - if (int column = decl.GetColumn()) - decl_obj.try_emplace("column", column); - - if (!decl_obj.empty()) - extensions.try_emplace("declaration", std::move(decl_obj)); - } - return extensions; -} - std::string VariableDescription::GetResult(llvm::StringRef context) { // In repl context, the results can be displayed as multiple lines so more // detailed descriptions can be returned. @@ -836,226 +840,6 @@ std::pair UnpackLocation(int64_t location_id) { return std::pair{location_id >> 1, location_id & 1}; } -// "Variable": { -// "type": "object", -// "description": "A Variable is a name/value pair. Optionally a variable -// can have a 'type' that is shown if space permits or when -// hovering over the variable's name. An optional 'kind' is -// used to render additional properties of the variable, -// e.g. different icons can be used to indicate that a -// variable is public or private. If the value is -// structured (has children), a handle is provided to -// retrieve the children with the VariablesRequest. If -// the number of named or indexed children is large, the -// numbers should be returned via the optional -// 'namedVariables' and 'indexedVariables' attributes. 
The -// client can use this optional information to present the -// children in a paged UI and fetch them in chunks.", -// "properties": { -// "name": { -// "type": "string", -// "description": "The variable's name." -// }, -// "value": { -// "type": "string", -// "description": "The variable's value. This can be a multi-line text, -// e.g. for a function the body of a function." -// }, -// "type": { -// "type": "string", -// "description": "The type of the variable's value. Typically shown in -// the UI when hovering over the value." -// }, -// "presentationHint": { -// "$ref": "#/definitions/VariablePresentationHint", -// "description": "Properties of a variable that can be used to determine -// how to render the variable in the UI." -// }, -// "evaluateName": { -// "type": "string", -// "description": "Optional evaluatable name of this variable which can -// be passed to the 'EvaluateRequest' to fetch the -// variable's value." -// }, -// "variablesReference": { -// "type": "integer", -// "description": "If variablesReference is > 0, the variable is -// structured and its children can be retrieved by -// passing variablesReference to the VariablesRequest." -// }, -// "namedVariables": { -// "type": "integer", -// "description": "The number of named child variables. The client can -// use this optional information to present the children -// in a paged UI and fetch them in chunks." -// }, -// "indexedVariables": { -// "type": "integer", -// "description": "The number of indexed child variables. The client -// can use this optional information to present the -// children in a paged UI and fetch them in chunks." -// }, -// "memoryReference": { -// "type": "string", -// "description": "A memory reference associated with this variable. -// For pointer type variables, this is generally a -// reference to the memory address contained in the -// pointer. For executable data, this reference may later -// be used in a `disassemble` request. 
This attribute may -// be returned by a debug adapter if corresponding -// capability `supportsMemoryReferences` is true." -// }, -// "declarationLocationReference": { -// "type": "integer", -// "description": "A reference that allows the client to request the -// location where the variable is declared. This should be -// present only if the adapter is likely to be able to -// resolve the location.\n\nThis reference shares the same -// lifetime as the `variablesReference`. See 'Lifetime of -// Object References' in the Overview section for -// details." -// }, -// "valueLocationReference": { -// "type": "integer", -// "description": "A reference that allows the client to request the -// location where the variable's value is declared. For -// example, if the variable contains a function pointer, -// the adapter may be able to look up the function's -// location. This should be present only if the adapter -// is likely to be able to resolve the location.\n\nThis -// reference shares the same lifetime as the -// `variablesReference`. See 'Lifetime of Object -// References' in the Overview section for details." -// }, -// -// "$__lldb_extensions": { -// "description": "Unofficial extensions to the protocol", -// "properties": { -// "declaration": { -// "type": "object", -// "description": "The source location where the variable was -// declared. This value won't be present if no -// declaration is available. -// Superseded by `declarationLocationReference`", -// "properties": { -// "path": { -// "type": "string", -// "description": "The source file path where the variable was -// declared." -// }, -// "line": { -// "type": "number", -// "description": "The 1-indexed source line where the variable -// was declared." -// }, -// "column": { -// "type": "number", -// "description": "The 1-indexed source column where the variable -// was declared." 
-// } -// } -// }, -// "value": { -// "type": "string", -// "description": "The internal value of the variable as returned by -// This is effectively SBValue.GetValue(). The other -// `value` entry in the top-level variable response -// is, on the other hand, just a display string for -// the variable." -// }, -// "summary": { -// "type": "string", -// "description": "The summary string of the variable. This is -// effectively SBValue.GetSummary()." -// }, -// "autoSummary": { -// "type": "string", -// "description": "The auto generated summary if using -// `enableAutoVariableSummaries`." -// }, -// "error": { -// "type": "string", -// "description": "An error message generated if LLDB couldn't inspect -// the variable." -// } -// } -// } -// }, -// "required": [ "name", "value", "variablesReference" ] -// } -llvm::json::Value CreateVariable(lldb::SBValue v, int64_t var_ref, - bool format_hex, bool auto_variable_summaries, - bool synthetic_child_debugging, - bool is_name_duplicated, - std::optional custom_name) { - VariableDescription desc(v, auto_variable_summaries, format_hex, - is_name_duplicated, custom_name); - llvm::json::Object object; - EmplaceSafeString(object, "name", desc.name); - EmplaceSafeString(object, "value", desc.display_value); - - if (!desc.evaluate_name.empty()) - EmplaceSafeString(object, "evaluateName", desc.evaluate_name); - - // If we have a type with many children, we would like to be able to - // give a hint to the IDE that the type has indexed children so that the - // request can be broken up in grabbing only a few children at a time. We - // want to be careful and only call "v.GetNumChildren()" if we have an array - // type or if we have a synthetic child provider producing indexed children. - // We don't want to call "v.GetNumChildren()" on all objects as class, struct - // and union types don't need to be completed if they are never expanded. 
So - // we want to avoid calling this to only cases where we it makes sense to keep - // performance high during normal debugging. - - // If we have an array type, say that it is indexed and provide the number - // of children in case we have a huge array. If we don't do this, then we - // might take a while to produce all children at onces which can delay your - // debug session. - if (desc.type_obj.IsArrayType()) { - object.try_emplace("indexedVariables", v.GetNumChildren()); - } else if (v.IsSynthetic()) { - // For a type with a synthetic child provider, the SBType of "v" won't tell - // us anything about what might be displayed. Instead, we check if the first - // child's name is "[0]" and then say it is indexed. We call - // GetNumChildren() only if the child name matches to avoid a potentially - // expensive operation. - if (lldb::SBValue first_child = v.GetChildAtIndex(0)) { - llvm::StringRef first_child_name = first_child.GetName(); - if (first_child_name == "[0]") { - size_t num_children = v.GetNumChildren(); - // If we are creating a "[raw]" fake child for each synthetic type, we - // have to account for it when returning indexed variables. - if (synthetic_child_debugging) - ++num_children; - object.try_emplace("indexedVariables", num_children); - } - } - } - EmplaceSafeString(object, "type", desc.display_type_name); - - // A unique variable identifier to help in properly identifying variables with - // the same name. This is an extension to the VS protocol. 
- object.try_emplace("id", var_ref); - - if (v.MightHaveChildren()) - object.try_emplace("variablesReference", var_ref); - else - object.try_emplace("variablesReference", 0); - - if (v.GetDeclaration().IsValid()) - object.try_emplace("declarationLocationReference", - PackLocation(var_ref, false)); - - if (ValuePointsToCode(v)) - object.try_emplace("valueLocationReference", PackLocation(var_ref, true)); - - if (lldb::addr_t addr = v.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) - object.try_emplace("memoryReference", EncodeMemoryReference(addr)); - - object.try_emplace("$__lldb_extensions", desc.GetVariableExtensionsJSON()); - return llvm::json::Value(std::move(object)); -} - llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit) { llvm::json::Object object; char unit_path_arr[PATH_MAX]; @@ -1070,12 +854,16 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit) { llvm::json::Object CreateRunInTerminalReverseRequest( llvm::StringRef program, const std::vector &args, const llvm::StringMap &env, llvm::StringRef cwd, - llvm::StringRef comm_file, lldb::pid_t debugger_pid) { + llvm::StringRef comm_file, lldb::pid_t debugger_pid, bool external) { llvm::json::Object run_in_terminal_args; - // This indicates the IDE to open an embedded terminal, instead of opening - // the terminal in a new window. - run_in_terminal_args.try_emplace("kind", "integrated"); - + if (external) { + // This indicates the IDE to open an external terminal window. + run_in_terminal_args.try_emplace("kind", "external"); + } else { + // This indicates the IDE to open an embedded terminal, instead of opening + // the terminal in a new window. 
+ run_in_terminal_args.try_emplace("kind", "integrated"); + } // The program path must be the first entry in the "args" field std::vector req_args = {DAP::debug_adapter_path.str(), "--comm-file", comm_file.str()}; diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 0424438ad5b72..e9094f67b94ec 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -138,6 +138,30 @@ std::string EncodeMemoryReference(lldb::addr_t addr); std::optional DecodeMemoryReference(llvm::StringRef memoryReference); +/// Decodes a memory reference from the given json value. +/// +/// \param[in] v +/// A JSON value that we expected to contain the memory reference. +/// +/// \param[in] key +/// The key of the memory reference. +/// +/// \param[out] out +/// The memory address, if successfully decoded. +/// +/// \param[in] path +/// The path for reporting errors. +/// +/// \param[in] required +/// Indicates if the key is required to be present, otherwise report an error +/// if the key is missing. +/// +/// \return +/// Returns \b true if the address was decoded successfully. +bool DecodeMemoryReference(const llvm::json::Value &v, llvm::StringLiteral key, + lldb::addr_t &out, llvm::json::Path path, + bool required); + /// Extract an array of strings for the specified key from an object. /// /// String values in the array will be extracted without any quotes @@ -326,10 +350,6 @@ struct VariableDescription { bool format_hex = false, bool is_name_duplicated = false, std::optional custom_name = {}); - /// Create a JSON object that represents these extensions to the DAP variable - /// response. - llvm::json::Object GetVariableExtensionsJSON(); - /// Returns a description of the value appropriate for the specified context. 
std::string GetResult(llvm::StringRef context); }; @@ -344,61 +364,6 @@ int64_t PackLocation(int64_t var_ref, bool is_value_location); /// Reverse of `PackLocation` std::pair UnpackLocation(int64_t location_id); -/// Create a "Variable" object for a LLDB thread object. -/// -/// This function will fill in the following keys in the returned -/// object: -/// "name" - the name of the variable -/// "value" - the value of the variable as a string -/// "type" - the typename of the variable as a string -/// "id" - a unique identifier for a value in case there are multiple -/// variables with the same name. Other parts of the DAP -/// protocol refer to values by name so this can help -/// disambiguate such cases if a IDE passes this "id" value -/// back down. -/// "variablesReference" - Zero if the variable has no children, -/// non-zero integer otherwise which can be used to expand -/// the variable. -/// "evaluateName" - The name of the variable to use in expressions -/// as a string. -/// -/// \param[in] v -/// The LLDB value to use when populating out the "Variable" -/// object. -/// -/// \param[in] var_ref -/// The variable reference. Used to identify the value, e.g. -/// in the `variablesReference` or `declarationLocationReference` -/// properties. -/// -/// \param[in] format_hex -/// If set to true the variable will be formatted as hex in -/// the "value" key value pair for the value of the variable. -/// -/// \param[in] auto_variable_summaries -/// IF set to true the variable will create an automatic variable summary. -/// -/// \param[in] is_name_duplicated -/// Whether the same variable name appears multiple times within the same -/// context (e.g. locals). This can happen due to shadowed variables in -/// nested blocks. -/// -/// As VSCode doesn't render two of more variables with the same name, we -/// apply a suffix to distinguish duplicated variables. 
-/// -/// \param[in] custom_name -/// A provided custom name that is used instead of the SBValue's when -/// creating the JSON representation. -/// -/// \return -/// A "Variable" JSON object with that follows the formal JSON -/// definition outlined by Microsoft. -llvm::json::Value CreateVariable(lldb::SBValue v, int64_t var_ref, - bool format_hex, bool auto_variable_summaries, - bool synthetic_child_debugging, - bool is_name_duplicated = false, - std::optional custom_name = {}); - llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit); /// Create a runInTerminal reverse request object @@ -423,13 +388,17 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit); /// launcher uses it on Linux tell the kernel that it should allow the /// debugger process to attach. /// +/// \param[in] external +/// If set to true, the program will run in an external terminal window +/// instead of IDE's integrated terminal. +/// /// \return /// A "runInTerminal" JSON object that follows the specification outlined by /// Microsoft. llvm::json::Object CreateRunInTerminalReverseRequest( llvm::StringRef program, const std::vector &args, const llvm::StringMap &env, llvm::StringRef cwd, - llvm::StringRef comm_file, lldb::pid_t debugger_pid); + llvm::StringRef comm_file, lldb::pid_t debugger_pid, bool external); /// Create a "Terminated" JSON object that contains statistics /// diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 83a205f118fc0..d9d688b4c41fe 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -262,6 +262,34 @@ json::Value toJSON(const BreakpointLocationsResponseBody &BLRB) { return json::Object{{"breakpoints", BLRB.breakpoints}}; } +bool fromJSON(const json::Value &Params, Console &C, json::Path P) { + auto oldFormatConsole = Params.getAsBoolean(); + if (oldFormatConsole) { + C = *oldFormatConsole ? 
eConsoleIntegratedTerminal : eConsoleInternal; + return true; + } + auto newFormatConsole = Params.getAsString(); + if (!newFormatConsole) { + P.report("expected a string"); + return false; + } + + std::optional console = + StringSwitch>(*newFormatConsole) + .Case("internalConsole", eConsoleInternal) + .Case("integratedTerminal", eConsoleIntegratedTerminal) + .Case("externalTerminal", eConsoleExternalTerminal) + .Default(std::nullopt); + if (!console) { + P.report("unexpected value, expected 'internalConsole', " + "'integratedTerminal' or 'externalTerminal'"); + return false; + } + + C = *console; + return true; +} + bool fromJSON(const json::Value &Params, LaunchRequestArguments &LRA, json::Path P) { json::ObjectMapper O(Params, P); @@ -273,9 +301,8 @@ bool fromJSON(const json::Value &Params, LaunchRequestArguments &LRA, O.mapOptional("disableASLR", LRA.disableASLR) && O.mapOptional("disableSTDIO", LRA.disableSTDIO) && O.mapOptional("shellExpandArguments", LRA.shellExpandArguments) && - - O.mapOptional("runInTerminal", LRA.runInTerminal) && - parseEnv(Params, LRA.env, P); + O.mapOptional("runInTerminal", LRA.console) && + O.mapOptional("console", LRA.console) && parseEnv(Params, LRA.env, P); } bool fromJSON(const json::Value &Params, AttachRequestArguments &ARA, @@ -531,6 +558,41 @@ json::Value toJSON(const ModulesResponseBody &MR) { return result; } +bool fromJSON(const json::Value &Param, VariablesArguments::VariablesFilter &VA, + json::Path Path) { + auto rawFilter = Param.getAsString(); + if (!rawFilter) { + Path.report("expected a string"); + return false; + } + std::optional filter = + StringSwitch>( + *rawFilter) + .Case("indexed", VariablesArguments::eVariablesFilterIndexed) + .Case("named", VariablesArguments::eVariablesFilterNamed) + .Default(std::nullopt); + if (!filter) { + Path.report("unexpected value, expected 'named' or 'indexed'"); + return false; + } + + VA = *filter; + return true; +} + +bool fromJSON(const json::Value &Param, VariablesArguments 
&VA, + json::Path Path) { + json::ObjectMapper O(Param, Path); + return O && O.map("variablesReference", VA.variablesReference) && + O.mapOptional("filter", VA.filter) && + O.mapOptional("start", VA.start) && O.mapOptional("count", VA.count) && + O.mapOptional("format", VA.format); +} + +json::Value toJSON(const VariablesResponseBody &VRB) { + return json::Object{{"variables", VRB.variables}}; +} + bool fromJSON(const json::Value &Params, WriteMemoryArguments &WMA, json::Path P) { json::ObjectMapper O(Params, P); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index 1544815be9389..07c4afbaa8700 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -242,6 +242,12 @@ struct Configuration { std::string platformName; }; +enum Console : unsigned { + eConsoleInternal, + eConsoleIntegratedTerminal, + eConsoleExternalTerminal +}; + /// lldb-dap specific launch arguments. struct LaunchRequestArguments { /// Common lldb-dap configuration values for launching/attaching operations. @@ -290,9 +296,9 @@ struct LaunchRequestArguments { /// Set whether to shell expand arguments to the process when launching. bool shellExpandArguments = false; - /// Launch the program inside an integrated terminal in the IDE. Useful for - /// debugging interactive command line programs. - bool runInTerminal = false; + /// Specify where to launch the program: internal console, integrated + /// terminal or external terminal. + Console console = eConsoleInternal; /// @} }; @@ -896,6 +902,54 @@ struct ModulesResponseBody { }; llvm::json::Value toJSON(const ModulesResponseBody &); +/// Arguments for `variables` request. +struct VariablesArguments { + /// The variable for which to retrieve its children. The `variablesReference` + /// must have been obtained in the current suspended state. See 'Lifetime of + /// Object References' in the Overview section for details. 
+ uint64_t variablesReference; + + enum VariablesFilter : unsigned { + eVariablesFilterBoth = 0, + eVariablesFilterIndexed = 1 << 0, + eVariablesFilterNamed = 1 << 1, + }; + + /// Filter to limit the child variables to either named or indexed. If + /// omitted, both types are fetched. + VariablesFilter filter = eVariablesFilterBoth; + + /// The index of the first variable to return; if omitted children start at 0. + /// + /// The attribute is only honored by a debug adapter if the corresponding + /// capability `supportsVariablePaging` is true. + uint64_t start = 0; + + /// The number of variables to return. If count is missing or 0, all variables + /// are returned. + /// + /// The attribute is only honored by a debug adapter if the corresponding + /// capability `supportsVariablePaging` is true. + uint64_t count = 0; + + /// Specifies details on how to format the Variable values. + /// + /// The attribute is only honored by a debug adapter if the corresponding + /// capability `supportsValueFormattingOptions` is true. + std::optional format; +}; +bool fromJSON(const llvm::json::Value &Param, + VariablesArguments::VariablesFilter &VA, llvm::json::Path Path); +bool fromJSON(const llvm::json::Value &, VariablesArguments &, + llvm::json::Path); + +/// Response to `variables` request. +struct VariablesResponseBody { + /// All (or a range) of variables for the given variable reference. + std::vector variables; +}; +llvm::json::Value toJSON(const VariablesResponseBody &); + /// Arguments for `writeMemory` request. struct WriteMemoryArguments { /// Memory reference to the base location to which data should be written. 
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index 9b5c9ef348ca4..785830c693104 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -9,6 +9,7 @@ #include "Protocol/ProtocolTypes.h" #include "JSONUtils.h" #include "ProtocolUtils.h" +#include "lldb/lldb-defines.h" #include "lldb/lldb-types.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -953,4 +954,71 @@ json::Value toJSON(const Module &M) { return result; } +json::Value toJSON(const VariablePresentationHint &VPH) { + json::Object result{}; + + if (!VPH.kind.empty()) + result.insert({"kind", VPH.kind}); + if (!VPH.attributes.empty()) + result.insert({"attributes", VPH.attributes}); + if (!VPH.visibility.empty()) + result.insert({"visibility", VPH.visibility}); + if (VPH.lazy) + result.insert({"lazy", VPH.lazy}); + + return result; +} + +bool fromJSON(const json::Value &Param, VariablePresentationHint &VPH, + json::Path Path) { + json::ObjectMapper O(Param, Path); + return O && O.mapOptional("kind", VPH.kind) && + O.mapOptional("attributes", VPH.attributes) && + O.mapOptional("visibility", VPH.visibility) && + O.mapOptional("lazy", VPH.lazy); +} + +json::Value toJSON(const Variable &V) { + json::Object result{{"name", V.name}, + {"variablesReference", V.variablesReference}, + {"value", V.value}}; + + if (!V.type.empty()) + result.insert({"type", V.type}); + if (V.presentationHint) + result.insert({"presentationHint", *V.presentationHint}); + if (!V.evaluateName.empty()) + result.insert({"evaluateName", V.evaluateName}); + if (V.namedVariables) + result.insert({"namedVariables", V.namedVariables}); + if (V.indexedVariables) + result.insert({"indexedVariables", V.indexedVariables}); + if (V.memoryReference != LLDB_INVALID_ADDRESS) + result.insert( + {"memoryReference", EncodeMemoryReference(V.memoryReference)}); + if (V.declarationLocationReference) + result.insert( + 
{"declarationLocationReference", V.declarationLocationReference}); + if (V.valueLocationReference) + result.insert({"valueLocationReference", V.valueLocationReference}); + + return result; +} + +bool fromJSON(const json::Value &Param, Variable &V, json::Path Path) { + json::ObjectMapper O(Param, Path); + return O && O.map("name", V.name) && + O.map("variablesReference", V.variablesReference) && + O.map("value", V.value) && O.mapOptional("type", V.type) && + O.mapOptional("presentationHint", *V.presentationHint) && + O.mapOptional("evaluateName", V.evaluateName) && + O.mapOptional("namedVariables", V.namedVariables) && + O.mapOptional("indexedVariables", V.indexedVariables) && + O.mapOptional("declarationLocationReference", + V.declarationLocationReference) && + O.mapOptional("valueLocationReference", V.valueLocationReference) && + DecodeMemoryReference(Param, "memoryReference", V.memoryReference, + Path, /*required=*/false); +} + } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 2bb765e956256..89122c8f66307 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -475,7 +475,7 @@ llvm::json::Value toJSON(const Thread &); /// Provides formatting information for a value. struct ValueFormat { /// Display the value in hex. - std::optional hex; + bool hex = false; }; bool fromJSON(const llvm::json::Value &, ValueFormat &, llvm::json::Path); @@ -789,6 +789,137 @@ struct Module { }; llvm::json::Value toJSON(const Module &); +/// Properties of a variable that can be used to determine how to render the +/// variable in the UI. +struct VariablePresentationHint { + /// The kind of variable. Before introducing additional values, try to use the + /// listed values. + std::string kind; + + /// Set of attributes represented as an array of strings. Before introducing + /// additional values, try to use the listed values. 
+ std::vector attributes; + + /// Visibility of variable. Before introducing additional values, try to use + /// the listed values. + std::string visibility; + + /// If true, clients can present the variable with a UI that supports a + /// specific gesture to trigger its evaluation. + /// + /// This mechanism can be used for properties that require executing code when + /// retrieving their value and where the code execution can be expensive + /// and/or produce side-effects. A typical example are properties based on a + /// getter function. + /// + /// Please note that in addition to the `lazy` flag, the variable's + /// `variablesReference` is expected to refer to a variable that will provide + /// the value through another `variable` request. + bool lazy = false; +}; +llvm::json::Value toJSON(const VariablePresentationHint &); +bool fromJSON(const llvm::json::Value &, VariablePresentationHint &, + llvm::json::Path); + +/// A Variable is a name/value pair. +/// +/// The `type` attribute is shown if space permits or when hovering over the +/// variable's name. +/// +/// The `kind` attribute is used to render additional properties of the +/// variable, e.g. different icons can be used to indicate that a variable is +/// public or private. +/// +/// If the value is structured (has children), a handle is provided to retrieve +/// the children with the `variables` request. +/// +/// If the number of named or indexed children is large, the numbers should be +/// returned via the `namedVariables` and `indexedVariables` attributes. +/// +/// The client can use this information to present the children in a paged UI +/// and fetch them in chunks. +struct Variable { + /// The variable's name. + std::string name; + + /// The variable's value. + /// + /// This can be a multi-line text, e.g. for a function the body of a function. 
+ /// + /// For structured variables (which do not have a simple value), it is + /// recommended to provide a one-line representation of the structured object. + /// This helps to identify the structured object in the collapsed state when + /// its children are not yet visible. + /// + /// An empty string can be used if no value should be shown in the UI. + std::string value; + + /// The type of the variable's value. Typically shown in the UI when hovering + /// over the value. + /// + /// This attribute should only be returned by a debug adapter if the + /// corresponding capability `supportsVariableType` is true. + std::string type; + + /// Properties of a variable that can be used to determine how to render the + /// variable in the UI. + std::optional presentationHint; + + /// The evaluatable name of this variable which can be passed to the + /// `evaluate` request to fetch the variable's value. + std::string evaluateName; + + /// If `variablesReference` is > 0, the variable is structured and its + /// children can be retrieved by passing `variablesReference` to the + /// `variables` request as long as execution remains suspended. See 'Lifetime + /// of Object References' in the Overview section for details. + uint64_t variablesReference = 0; + + /// The number of named child variables. + /// + /// The client can use this information to present the children in a paged UI + /// and fetch them in chunks. + uint64_t namedVariables = 0; + + /// The number of indexed child variables. + /// + /// The client can use this information to present the children in a paged UI + /// and fetch them in chunks. + uint64_t indexedVariables = 0; + + /// A memory reference associated with this variable. + /// + /// For pointer type variables, this is generally a reference to the memory + /// address contained in the pointer. + /// + /// For executable data, this reference may later be used in a `disassemble` + /// request. 
+ /// + /// This attribute may be returned by a debug adapter if corresponding + /// capability `supportsMemoryReferences` is true. + lldb::addr_t memoryReference = LLDB_INVALID_ADDRESS; + + /// A reference that allows the client to request the location where the + /// variable is declared. This should be present only if the adapter is likely + /// to be able to resolve the location. + /// + /// This reference shares the same lifetime as the `variablesReference`. See + /// 'Lifetime of Object References' in the Overview section for details. + uint64_t declarationLocationReference = 0; + + /// A reference that allows the client to request the location where the + /// variable's value is declared. For example, if the variable contains a + /// function pointer, the adapter may be able to look up the function's + /// location. This should be present only if the adapter is likely to be able + /// to resolve the location. + /// + /// This reference shares the same lifetime as the `variablesReference`. See + /// 'Lifetime of Object References' in the Overview section for details. 
+ uint64_t valueLocationReference = 0; +}; +llvm::json::Value toJSON(const Variable &); +bool fromJSON(const llvm::json::Value &, Variable &, llvm::json::Path); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp index f9e373db74618..775c82fbb7716 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.cpp +++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "ProtocolUtils.h" +#include "JSONUtils.h" #include "LLDBUtils.h" #include "lldb/API/SBDebugger.h" +#include "lldb/API/SBDeclaration.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBMutex.h" #include "lldb/API/SBStream.h" @@ -227,9 +229,9 @@ std::vector GetThreads(lldb::SBProcess process, return threads; } -protocol::ExceptionBreakpointsFilter +ExceptionBreakpointsFilter CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) { - protocol::ExceptionBreakpointsFilter filter; + ExceptionBreakpointsFilter filter; filter.filter = bp.GetFilter(); filter.label = bp.GetLabel(); filter.description = bp.GetLabel(); @@ -238,4 +240,68 @@ CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) { return filter; } +Variable CreateVariable(lldb::SBValue v, int64_t var_ref, bool format_hex, + bool auto_variable_summaries, + bool synthetic_child_debugging, bool is_name_duplicated, + std::optional custom_name) { + VariableDescription desc(v, auto_variable_summaries, format_hex, + is_name_duplicated, custom_name); + Variable var; + var.name = desc.name; + var.value = desc.display_value; + var.type = desc.display_type_name; + + if (!desc.evaluate_name.empty()) + var.evaluateName = desc.evaluate_name; + + // If we have a type with many children, we would like to be able to + // give a hint to the IDE that the type has indexed children so that the + // request can be broken up in grabbing only a few children at a time. 
We + // want to be careful and only call "v.GetNumChildren()" if we have an array + // type or if we have a synthetic child provider producing indexed children. + // We don't want to call "v.GetNumChildren()" on all objects as class, struct + // and union types don't need to be completed if they are never expanded. So + // we want to avoid calling this to only cases where we it makes sense to keep + // performance high during normal debugging. + + // If we have an array type, say that it is indexed and provide the number + // of children in case we have a huge array. If we don't do this, then we + // might take a while to produce all children at onces which can delay your + // debug session. + if (desc.type_obj.IsArrayType()) { + var.indexedVariables = v.GetNumChildren(); + } else if (v.IsSynthetic()) { + // For a type with a synthetic child provider, the SBType of "v" won't tell + // us anything about what might be displayed. Instead, we check if the first + // child's name is "[0]" and then say it is indexed. We call + // GetNumChildren() only if the child name matches to avoid a potentially + // expensive operation. + if (lldb::SBValue first_child = v.GetChildAtIndex(0)) { + llvm::StringRef first_child_name = first_child.GetName(); + if (first_child_name == "[0]") { + size_t num_children = v.GetNumChildren(); + // If we are creating a "[raw]" fake child for each synthetic type, we + // have to account for it when returning indexed variables. 
+ if (synthetic_child_debugging) + ++num_children; + var.indexedVariables = num_children; + } + } + } + + if (v.MightHaveChildren()) + var.variablesReference = var_ref; + + if (v.GetDeclaration().IsValid()) + var.declarationLocationReference = PackLocation(var_ref, false); + + if (ValuePointsToCode(v)) + var.valueLocationReference = PackLocation(var_ref, true); + + if (lldb::addr_t addr = v.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) + var.memoryReference = addr; + + return var; +} + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/ProtocolUtils.h b/lldb/tools/lldb-dap/ProtocolUtils.h index d906d8e881158..a1f7ae0661914 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.h +++ b/lldb/tools/lldb-dap/ProtocolUtils.h @@ -106,6 +106,48 @@ CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp); /// "2 MB"). std::string ConvertDebugInfoSizeToString(uint64_t debug_size); +/// Create a protocol Variable for the given value. +/// +/// \param[in] v +/// The LLDB value to use when populating out the "Variable" +/// object. +/// +/// \param[in] var_ref +/// The variable reference. Used to identify the value, e.g. +/// in the `variablesReference` or `declarationLocationReference` +/// properties. +/// +/// \param[in] format_hex +/// If set to true the variable will be formatted as hex in +/// the "value" key value pair for the value of the variable. +/// +/// \param[in] auto_variable_summaries +/// If set to true the variable will create an automatic variable summary. +/// +/// \param[in] synthetic_child_debugging +/// Whether to include synthetic children when listing properties of the +/// value. +/// +/// \param[in] is_name_duplicated +/// Whether the same variable name appears multiple times within the same +/// context (e.g. locals). This can happen due to shadowed variables in +/// nested blocks. +/// +/// As VSCode doesn't render two or more variables with the same name, we +/// apply a suffix to distinguish duplicated variables. 
+/// +/// \param[in] custom_name +/// A provided custom name that is used instead of the SBValue's when +/// creating the JSON representation. +/// +/// \return +/// A Variable representing the given value. +protocol::Variable CreateVariable(lldb::SBValue v, int64_t var_ref, + bool format_hex, bool auto_variable_summaries, + bool synthetic_child_debugging, + bool is_name_duplicated, + std::optional custom_name = {}); + } // namespace lldb_dap #endif diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 18bfa9d518b98..f88f3ced6f25f 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ -235,7 +235,8 @@ contain the following key/value pairs: | **cwd** | string | | The program working directory. | **env** | dictionary | | Environment variables to set when launching the program. The format of each environment variable string is "VAR=VALUE" for environment variables with values or just "VAR" for environment variables with no values. | **stopOnEntry** | boolean | | Whether to stop program immediately after launching. -| **runInTerminal** | boolean | | Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs. +| **runInTerminal** (deprecated) | boolean | | Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs. +| **console** | string | | Specify where to launch the program: internal console (`internalConsole`), integrated terminal (`integratedTerminal`) or external terminal (`externalTerminal`). Supported from lldb-dap 21.0 version. | **launchCommands** | [string] | | LLDB commands executed to launch the program. 
For JSON configurations of `"type": "attach"`, the JSON configuration can contain diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index b150dee792c34..801abe73edd7d 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -528,7 +528,23 @@ "runInTerminal": { "type": "boolean", "description": "Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs", - "default": false + "default": false, + "deprecationMessage": "Attribute 'runInTerminal' is deprecated, use 'console' instead." + }, + "console": { + "type": "string", + "enum": [ + "internalConsole", + "integratedTerminal", + "externalTerminal" + ], + "enumDescriptions": [ + "Use Debug Console for output (input is not supported).", + "Launch the program inside an integrated terminal in the IDE.", + "Launch the program inside an external terminal window." + ], + "description": "Specify where to launch the program: internal console, integrated terminal or external terminal.", + "default": "internalConsole" }, "timeout": { "type": "number", diff --git a/lldb/unittests/DAP/JSONUtilsTest.cpp b/lldb/unittests/DAP/JSONUtilsTest.cpp index 876980eb4bf4a..86ba2d171a2c0 100644 --- a/lldb/unittests/DAP/JSONUtilsTest.cpp +++ b/lldb/unittests/DAP/JSONUtilsTest.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include "JSONUtils.h" -#include "lldb/API/SBModule.h" -#include "lldb/API/SBTarget.h" +#include "lldb/lldb-defines.h" #include "llvm/Support/JSON.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include @@ -182,3 +182,66 @@ TEST(JSONUtilsTest, GetStrings_NestedArray) { ASSERT_EQ(result.size(), 1UL); EXPECT_EQ(result[0], "string"); } + +TEST(JSONUtilsTest, DecodeMemoryReference) { + EXPECT_EQ(DecodeMemoryReference(""), std::nullopt); + EXPECT_EQ(DecodeMemoryReference("123"), std::nullopt); + EXPECT_EQ(DecodeMemoryReference("0o123"), 
std::nullopt); + EXPECT_EQ(DecodeMemoryReference("0b1010101"), std::nullopt); + EXPECT_EQ(DecodeMemoryReference("0x123"), 291u); + + { + addr_t addr = LLDB_INVALID_ADDRESS; + json::Path::Root root; + EXPECT_TRUE(DecodeMemoryReference(json::Object{{"mem_ref", "0x123"}}, + "mem_ref", addr, root, + /*required=*/true)); + EXPECT_EQ(addr, 291u); + } + + { + addr_t addr = LLDB_INVALID_ADDRESS; + json::Path::Root root; + EXPECT_TRUE(DecodeMemoryReference(json::Object{}, "mem_ref", addr, root, + /*required=*/false)); + } + + { + addr_t addr = LLDB_INVALID_ADDRESS; + json::Path::Root root; + EXPECT_FALSE(DecodeMemoryReference(json::Value{"string"}, "mem_ref", addr, + root, + /*required=*/true)); + EXPECT_THAT_ERROR(root.getError(), FailedWithMessage("expected object")); + } + + { + addr_t addr = LLDB_INVALID_ADDRESS; + json::Path::Root root; + EXPECT_FALSE(DecodeMemoryReference(json::Object{}, "mem_ref", addr, root, + /*required=*/true)); + EXPECT_THAT_ERROR(root.getError(), + FailedWithMessage("missing value at (root).mem_ref")); + } + + { + addr_t addr = LLDB_INVALID_ADDRESS; + json::Path::Root root; + EXPECT_FALSE(DecodeMemoryReference(json::Object{{"mem_ref", 123}}, + "mem_ref", addr, root, + /*required=*/true)); + EXPECT_THAT_ERROR(root.getError(), + FailedWithMessage("expected string at (root).mem_ref")); + } + + { + addr_t addr = LLDB_INVALID_ADDRESS; + json::Path::Root root; + EXPECT_FALSE(DecodeMemoryReference(json::Object{{"mem_ref", "123"}}, + "mem_ref", addr, root, + /*required=*/true)); + EXPECT_THAT_ERROR( + root.getError(), + FailedWithMessage("malformed memory reference at (root).mem_ref")); + } +} diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index b5cf06bd6f0b6..8add315f47036 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -883,3 +883,120 @@ TEST(ProtocolTypesTest, ModulesResponseBody) { ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); 
EXPECT_EQ(pp(*expected), pp(response)); } + +TEST(ProtocolTypesTest, VariablePresentationHint) { + VariablePresentationHint hint; + hint.kind = "kind"; + hint.attributes = {"a", "b", "c"}; + hint.visibility = "public"; + hint.lazy = true; + + const StringRef json = R"({ + "attributes": [ + "a", + "b", + "c" + ], + "kind": "kind", + "lazy": true, + "visibility": "public" +})"; + + EXPECT_EQ(pp(Value(hint)), json); + EXPECT_THAT_EXPECTED(json::parse(json), HasValue(Value(hint))); +} + +TEST(ProtocolTypesTest, Variable) { + Variable var; + var.name = "var1"; + var.variablesReference = 42; + var.value = "value"; + var.type = "type"; + + VariablePresentationHint hint; + hint.kind = "kind"; + var.presentationHint = std::move(hint); + var.evaluateName = "my_name"; + var.namedVariables = 7; + var.indexedVariables = 7; + var.memoryReference = 291u; + var.declarationLocationReference = 24; + var.valueLocationReference = 100; + + const StringRef json = R"({ + "declarationLocationReference": 24, + "evaluateName": "my_name", + "indexedVariables": 7, + "memoryReference": "0x123", + "name": "var1", + "namedVariables": 7, + "presentationHint": { + "kind": "kind" + }, + "type": "type", + "value": "value", + "valueLocationReference": 100, + "variablesReference": 42 +})"; + + EXPECT_EQ(pp(Value(var)), json); + EXPECT_THAT_EXPECTED(json::parse(json), HasValue(Value(var))); +} + +TEST(ProtocolTypesTest, VariablesArguments) { + llvm::Expected expected = parse(R"({ + "variablesReference": 42, + "filter": "indexed", + "start": 10, + "count": 5, + "format": { + "hex": true + } + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->variablesReference, 42u); + EXPECT_EQ(expected->filter, VariablesArguments::eVariablesFilterIndexed); + EXPECT_EQ(expected->start, 10u); + EXPECT_EQ(expected->count, 5u); + EXPECT_EQ(expected->format->hex, true); + + EXPECT_THAT_EXPECTED( + parse(R"({})"), + FailedWithMessage("missing value at (root).variablesReference")); + 
EXPECT_THAT_EXPECTED( + parse( + R"({"variablesReference": 42, "filter": "my-filter"})"), + FailedWithMessage( + "unexpected value, expected 'named' or 'indexed' at (root).filter")); +} + +TEST(ProtocolTypesTest, VariablesResponseBody) { + Variable var1; + var1.name = "var1"; + var1.variablesReference = 42; + var1.value = ""; + + Variable var2; + var2.name = "var2"; + var2.variablesReference = 3; + var2.value = ""; + + VariablesResponseBody response{{var1, var2}}; + + Expected expected = json::parse(R"({ + "variables": [ + { + "name": "var1", + "value": "", + "variablesReference": 42 + }, + { + "name": "var2", + "value": "", + "variablesReference": 3 + } + ] + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(pp(*expected), pp(response)); +} diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake index 67a628d4953c3..9a2e73a1e3718 100644 --- a/llvm/cmake/modules/TableGen.cmake +++ b/llvm/cmake/modules/TableGen.cmake @@ -4,10 +4,6 @@ # Adds the name of the generated file to TABLEGEN_OUTPUT. include(LLVMDistributionSupport) -# Clear out any pre-existing compile_commands file before processing. This -# allows for generating a clean compile_commands on each configure. -file(REMOVE ${CMAKE_BINARY_DIR}/tablegen_compile_commands.yml) - function(tablegen project ofn) cmake_parse_arguments(ARG "" "" "DEPENDS;EXTRA_INCLUDES" ${ARGN}) @@ -250,3 +246,11 @@ macro(add_tablegen target project) set_property(GLOBAL APPEND PROPERTY ${export_upper}_EXPORTS ${target}) endif() endmacro() + +# Make sure 'tablegen_compile_commands.yml' is only deleted once the very +# first time this file is included. +include_guard(GLOBAL) + +# Clear out any pre-existing compile_commands file before processing. This +# allows for generating a clean compile_commands on each configure. 
+file(REMOVE ${CMAKE_BINARY_DIR}/tablegen_compile_commands.yml) diff --git a/llvm/docs/Remarks.rst b/llvm/docs/Remarks.rst index 3be66e5adac95..c89940f9ff4d5 100644 --- a/llvm/docs/Remarks.rst +++ b/llvm/docs/Remarks.rst @@ -57,6 +57,11 @@ Enabling optimization remarks There are two modes that are supported for enabling optimization remarks in LLVM: through remark diagnostics, or through serialized remarks. +See also the clang flags +`-Rpass `_ +and +`-fsave-optimization-record `_. + Remark diagnostics ------------------ diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h index 7d1caa6438906..212c2e1c86a65 100644 --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -688,6 +688,77 @@ LLVM_C_ABI LLVMMetadataRef LLVMDIBuilderCreateArrayType( LLVMDIBuilderRef Builder, uint64_t Size, uint32_t AlignInBits, LLVMMetadataRef Ty, LLVMMetadataRef *Subscripts, unsigned NumSubscripts); +/** + * Create debugging information entry for a set. + * \param Builder The DIBuilder. + * \param Scope The scope in which the set is defined. + * \param Name A name that uniquely identifies this set. + * \param NameLen The length of the C string passed to \c Name. + * \param File File where the set is located. + * \param Line Line number of the declaration. + * \param SizeInBits Set size. + * \param AlignInBits Set alignment. + * \param BaseTy The base type of the set. + */ +LLVMMetadataRef LLVMDIBuilderCreateSetType( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, LLVMMetadataRef File, unsigned LineNumber, + uint64_t SizeInBits, uint32_t AlignInBits, LLVMMetadataRef BaseTy); + +/** + * Create a descriptor for a subrange with dynamic bounds. + * \param Builder The DIBuilder. + * \param Scope The scope in which the subrange is defined. + * \param Name A name that uniquely identifies this subrange. + * \param NameLen The length of the C string passed to \c Name. + * \param LineNo Line number. 
+ * \param File File where the subrange is located. + * \param SizeInBits Member size. + * \param AlignInBits Member alignment. + * \param Flags Flags. + * \param BaseTy The base type of the subrange, e.g. integer or enumeration + * \param LowerBound Lower bound of the subrange. + * \param UpperBound Upper bound of the subrange. + * \param Stride Stride of the subrange. + * \param Bias Bias of the subrange. + */ +LLVMMetadataRef LLVMDIBuilderCreateSubrangeType( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, unsigned LineNo, LLVMMetadataRef File, uint64_t SizeInBits, + uint32_t AlignInBits, LLVMDIFlags Flags, LLVMMetadataRef BaseTy, + LLVMMetadataRef LowerBound, LLVMMetadataRef UpperBound, + LLVMMetadataRef Stride, LLVMMetadataRef Bias); + +/** + * Create debugging information entry for a dynamic array. + * \param Builder The DIBuilder. + * \param Size Array size. + * \param AlignInBits Alignment. + * \param Ty Element type. + * \param Subscripts Subscripts. + * \param NumSubscripts Number of subscripts. + * \param DataLocation DataLocation. (DIVariable, DIExpression or NULL) + * \param Associated Associated. (DIVariable, DIExpression or NULL) + * \param Allocated Allocated. (DIVariable, DIExpression or NULL) + * \param Rank Rank. (DIVariable, DIExpression or NULL) + * \param BitStride BitStride. + */ +LLVMMetadataRef LLVMDIBuilderCreateDynamicArrayType( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, unsigned LineNo, LLVMMetadataRef File, uint64_t Size, + uint32_t AlignInBits, LLVMMetadataRef Ty, LLVMMetadataRef *Subscripts, + unsigned NumSubscripts, LLVMMetadataRef DataLocation, + LLVMMetadataRef Associated, LLVMMetadataRef Allocated, LLVMMetadataRef Rank, + LLVMMetadataRef BitStride); + +/** + * Replace arrays. 
+ * + * @see DIBuilder::replaceArrays() + */ +void LLVMReplaceArrays(LLVMDIBuilderRef Builder, LLVMMetadataRef *T, + LLVMMetadataRef *Elements, unsigned NumElements); + /** * Create debugging information entry for a vector type. * \param Builder The DIBuilder. diff --git a/llvm/include/llvm/ADT/DenseMapInfo.h b/llvm/include/llvm/ADT/DenseMapInfo.h index 07c37e353a40b..b850223c953da 100644 --- a/llvm/include/llvm/ADT/DenseMapInfo.h +++ b/llvm/include/llvm/ADT/DenseMapInfo.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -320,6 +321,28 @@ struct DenseMapInfo>> { static bool isEqual(const Enum &LHS, const Enum &RHS) { return LHS == RHS; } }; + +template struct DenseMapInfo> { + using Optional = std::optional; + using Info = DenseMapInfo; + + static inline Optional getEmptyKey() { return {Info::getEmptyKey()}; } + + static inline Optional getTombstoneKey() { return {Info::getTombstoneKey()}; } + + static unsigned getHashValue(const Optional &OptionalVal) { + return detail::combineHashValue( + OptionalVal.has_value(), + Info::getHashValue(OptionalVal.value_or(Info::getEmptyKey()))); + } + + static bool isEqual(const Optional &LHS, const Optional &RHS) { + if (LHS && RHS) { + return Info::isEqual(LHS.value(), RHS.value()); + } + return !LHS && !RHS; + } +}; } // end namespace llvm #endif // LLVM_ADT_DENSEMAPINFO_H diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 1415da14a3494..73bfe1aabb4e0 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -423,6 +423,11 @@ class MemoryDepChecker { getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B, Instruction *BInst); + + // Return true if we can prove that \p Sink only accesses memory after \p + // Src's end or vice versa. 
+ bool areAccessesCompletelyBeforeOrAfter(const SCEV *Src, Type *SrcTy, + const SCEV *Sink, Type *SinkTy); }; class RuntimePointerChecking; diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index e9b573733451b..5c81c48a80303 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -23,6 +23,7 @@ #include #include #include +#include "llvm/ADT/DenseMap.h" namespace llvm { @@ -93,6 +94,18 @@ class BitcodeWriter { bool GenerateHash = false, ModuleHash *ModHash = nullptr); + + +// This is for writing backward-compatible bitcode with non-opaque types + +LLVM_ABI void writeBitcodeWithNonOpaqueTypes(const Module &M, + bool ShouldPreserveUseListOrder = false, + const ModuleSummaryIndex *Index = nullptr, + bool GenerateHash = false, + ModuleHash *ModHash = nullptr, + bool WriteNonOpaqueTypes = true, + DenseMap *NonOpaqueTypeMap = nullptr); + /// Write the specified thin link bitcode file (i.e., the minimized bitcode /// file) to the buffer specified at construction time. 
The thin link /// bitcode file is used for thin link, and it only contains the necessary diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 08e6a0e3ef629..f0cfa7663c5fa 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -19,7 +19,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" @@ -27,12 +26,13 @@ #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/UniqueBBID.h" #include "llvm/Target/TargetMachine.h" namespace llvm { // This struct represents the cluster information for a machine basic block, -// which is specifed by a unique ID (`MachineBasicBlock::BBID`). +// which is specified by a unique basic block ID. struct BBClusterInfo { // Basic block ID. UniqueBBID BBID; @@ -52,27 +52,6 @@ struct FunctionPathAndClusterInfo { SmallVector> ClonePaths; }; -// Provides DenseMapInfo for UniqueBBID. 
-template <> struct DenseMapInfo { - static inline UniqueBBID getEmptyKey() { - unsigned EmptyKey = DenseMapInfo::getEmptyKey(); - return UniqueBBID{EmptyKey, EmptyKey}; - } - static inline UniqueBBID getTombstoneKey() { - unsigned TombstoneKey = DenseMapInfo::getTombstoneKey(); - return UniqueBBID{TombstoneKey, TombstoneKey}; - } - static unsigned getHashValue(const UniqueBBID &Val) { - std::pair PairVal = - std::make_pair(Val.BaseID, Val.CloneID); - return DenseMapInfo>::getHashValue(PairVal); - } - static bool isEqual(const UniqueBBID &LHS, const UniqueBBID &RHS) { - return DenseMapInfo::isEqual(LHS.BaseID, RHS.BaseID) && - DenseMapInfo::isEqual(LHS.CloneID, RHS.CloneID); - } -}; - class BasicBlockSectionsProfileReader { public: friend class BasicBlockSectionsProfileReaderWrapperPass; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index fefd36ec54ae2..f6936b98bf3e4 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1773,6 +1773,39 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } } + if (ICA.getID() == Intrinsic::vp_scatter) { + if (ICA.isTypeBasedOnly()) { + IntrinsicCostAttributes MaskedScatter( + *VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID()), + ICA.getReturnType(), ArrayRef(ICA.getArgTypes()).drop_back(1), + ICA.getFlags()); + return getTypeBasedIntrinsicInstrCost(MaskedScatter, CostKind); + } + Align Alignment; + if (auto *VPI = dyn_cast_or_null(ICA.getInst())) + Alignment = VPI->getPointerAlignment().valueOrOne(); + bool VarMask = isa(ICA.getArgs()[2]); + return thisT()->getGatherScatterOpCost( + Instruction::Store, ICA.getArgTypes()[0], ICA.getArgs()[1], VarMask, + Alignment, CostKind, nullptr); + } + if (ICA.getID() == Intrinsic::vp_gather) { + if (ICA.isTypeBasedOnly()) { + IntrinsicCostAttributes MaskedGather( + *VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID()), + ICA.getReturnType(), 
ArrayRef(ICA.getArgTypes()).drop_back(1), + ICA.getFlags()); + return getTypeBasedIntrinsicInstrCost(MaskedGather, CostKind); + } + Align Alignment; + if (auto *VPI = dyn_cast_or_null(ICA.getInst())) + Alignment = VPI->getPointerAlignment().valueOrOne(); + bool VarMask = isa(ICA.getArgs()[1]); + return thisT()->getGatherScatterOpCost( + Instruction::Load, ICA.getReturnType(), ICA.getArgs()[0], VarMask, + Alignment, CostKind, nullptr); + } + if (ICA.getID() == Intrinsic::vp_select || ICA.getID() == Intrinsic::vp_merge) { TTI::OperandValueInfo OpInfoX, OpInfoY; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 7a598bb77b356..756c0b24a6f8b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -2424,6 +2424,11 @@ class LLVM_ABI MachineIRBuilder { return buildInstr(TargetOpcode::G_RESET_FPMODE, {}, {}); } + /// Build and insert \p Dst = G_GET_ROUNDING + MachineInstrBuilder buildGetRounding(const DstOp &Dst) { + return buildInstr(TargetOpcode::G_GET_ROUNDING, {Dst}, {}); + } + virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, ArrayRef SrcOps, std::optional Flags = std::nullopt); diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 3d2da01f2c856..938d71dd030e8 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -25,6 +25,7 @@ #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/UniqueBBID.h" #include #include #include @@ -99,13 +100,6 @@ template <> struct DenseMapInfo { } }; -// This structure represents the information for a basic block pertaining to -// the basic block sections profile. 
-struct UniqueBBID { - unsigned BaseID; - unsigned CloneID; -}; - template <> struct ilist_traits { private: friend class MachineBasicBlock; // Set by the owning MachineBasicBlock. diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index e4e794c434adb..e50443d25cc60 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -120,14 +120,17 @@ class SwingSchedulerDDGEdge { SUnit *Dst = nullptr; SDep Pred; unsigned Distance = 0; + bool IsValidationOnly = false; public: /// Creates an edge corresponding to an edge represented by \p PredOrSucc and /// \p Dep in the original DAG. This pair has no information about the /// direction of the edge, so we need to pass an additional argument \p /// IsSucc. - SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc) - : Dst(PredOrSucc), Pred(Dep), Distance(0u) { + SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc, + bool IsValidationOnly) + : Dst(PredOrSucc), Pred(Dep), Distance(0u), + IsValidationOnly(IsValidationOnly) { SUnit *Src = Dep.getSUnit(); if (IsSucc) { @@ -188,6 +191,10 @@ class SwingSchedulerDDGEdge { /// functions. We ignore the back-edge recurrence in order to avoid unbounded /// recursion in the calculation of the ASAP, ALAP, etc functions. bool ignoreDependence(bool IgnoreAnti) const; + + /// Returns true if this edge is intended to be used only for validating the + /// schedule. + bool isValidationOnly() const { return IsValidationOnly; } }; /// Represents loop-carried dependencies. Because SwingSchedulerDAG doesn't @@ -208,25 +215,21 @@ struct LoopCarriedEdges { return &Ite->second; } - /// Retruns true if the edge from \p From to \p To is a back-edge that should - /// be used when scheduling. - bool shouldUseWhenScheduling(const SUnit *From, const SUnit *To) const; - /// Adds some edges to the original DAG that correspond to loop-carried /// dependencies. 
Historically, loop-carried edges are represented by using /// non-loop-carried edges in the original DAG. This function appends such /// edges to preserve the previous behavior. - void modifySUnits(std::vector &SUnits); + void modifySUnits(std::vector &SUnits, const TargetInstrInfo *TII); void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const; }; -/// Represents dependencies between instructions. This class is a wrapper of -/// `SUnits` and its dependencies to manipulate back-edges in a natural way. -/// Currently it only supports back-edges via PHI, which are expressed as -/// anti-dependencies in the original DAG. -/// FIXME: Support any other loop-carried dependencies +/// This class provides APIs to retrieve edges from/to an SUnit node, with a +/// particular focus on loop-carried dependencies. Since SUnit is not designed +/// to represent such edges, handling them directly using its APIs has required +/// non-trivial logic in the past. This class serves as a wrapper around SUnit, +/// offering a simpler interface for managing these dependencies. class SwingSchedulerDDG { using EdgesType = SmallVector; @@ -244,17 +247,26 @@ class SwingSchedulerDDG { SwingSchedulerDDGEdges EntrySUEdges; SwingSchedulerDDGEdges ExitSUEdges; + /// Edges that are used only when validating the schedule. These edges are + /// not considered to drive the optimization heuristics. + SmallVector ValidationOnlyEdges; + + /// Adds a NON-validation-only edge to the DDG. Assumes to be called only by + /// the ctor. 
void addEdge(const SUnit *SU, const SwingSchedulerDDGEdge &Edge); SwingSchedulerDDGEdges &getEdges(const SUnit *SU); const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const; public: - SwingSchedulerDDG(std::vector &SUnits, SUnit *EntrySU, SUnit *ExitSU); + SwingSchedulerDDG(std::vector &SUnits, SUnit *EntrySU, SUnit *ExitSU, + const LoopCarriedEdges &LCE); const EdgesType &getInEdges(const SUnit *SU) const; const EdgesType &getOutEdges(const SUnit *SU) const; + + bool isValidSchedule(const SMSchedule &Schedule) const; }; /// This class builds the dependence graph for the instructions in a loop, diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index b856b4786573b..657951ddafd4f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2488,8 +2488,7 @@ class SelectionDAG { /// Check if a value \op N is a constant using the target's BooleanContent for /// its type. - LLVM_ABI std::optional - isBoolConstant(SDValue N, bool AllowTruncation = false) const; + LLVM_ABI std::optional isBoolConstant(SDValue N) const; /// Set CallSiteInfo to be associated with Node. void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo) { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index c9e5d9999138f..a248eb7444b20 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4371,6 +4371,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { Op.getOpcode() == ISD::SPLAT_VECTOR_PARTS; } + /// Return true if the given select/vselect should be considered canonical and + /// not be transformed. Currently only used for "vselect (not Cond), N1, N2 -> + /// vselect Cond, N2, N1". + virtual bool isTargetCanonicalSelect(SDNode *N) const { return false; } + struct DAGCombinerInfo { void *DC; // The DAG Combiner object. 
CombineLevel Level; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index f2610011a7e04..1b94657dfae1e 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -955,11 +955,12 @@ def OMP_Parallel : Directive<[Spelling<"parallel">]> { VersionedClause, VersionedClause, ]; - let allowedOnceClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, + let allowedOnceClauses = [VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, ]; let association = AS_Block; let category = CA_Executable; diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 16885f331e9dd..8016757cf0f3c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3510,6 +3510,18 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">, // gfx1250 intrinsics // ===----------------------------------------------------------------------===// +// Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt which is +// modeled as InaccessibleMem. 
+class AMDGPUWaitAsyncIntrinsic : + Intrinsic<[], [llvm_i16_ty], + [IntrInaccessibleMemOnly, ImmArg>, IntrWillReturn, IntrNoCallback, + IntrNoFree]>; + +def int_amdgcn_s_wait_asynccnt : + ClangBuiltin<"__builtin_amdgcn_s_wait_asynccnt">, AMDGPUWaitAsyncIntrinsic; +def int_amdgcn_s_wait_tensorcnt : + ClangBuiltin<"__builtin_amdgcn_s_wait_tensorcnt">, AMDGPUWaitAsyncIntrinsic; + def int_amdgcn_ds_atomic_async_barrier_arrive_b64 : ClangBuiltin<"__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64">, Intrinsic<[], [local_ptr_ty], diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index cd676e1661d62..c236e698759cc 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -387,6 +387,34 @@ def HEXAGON_MEMCPY_LIKELY_ALIGNED_MIN32BYTES_MULT8BYTES : RuntimeLibcall; // XCore calls def MEMCPY_ALIGN_4 : RuntimeLibcall; +// Objective-C calls +def OBJC_AUTORELEASE : RuntimeLibcall; +def OBJC_AUTORELEASEPOOLPOP : RuntimeLibcall; +def OBJC_AUTORELEASEPOOLPUSH : RuntimeLibcall; +def OBJC_AUTORELEASERETURNVALUE : RuntimeLibcall; +def OBJC_COPYWEAK : RuntimeLibcall; +def OBJC_DESTROYWEAK : RuntimeLibcall; +def OBJC_INITWEAK : RuntimeLibcall; +def OBJC_LOADWEAK : RuntimeLibcall; +def OBJC_LOADWEAKRETAINED : RuntimeLibcall; +def OBJC_MOVEWEAK : RuntimeLibcall; +def OBJC_RELEASE : RuntimeLibcall; +def OBJC_RETAIN : RuntimeLibcall; +def OBJC_RETAINAUTORELEASE : RuntimeLibcall; +def OBJC_RETAINAUTORELEASERETURNVALUE : RuntimeLibcall; +def OBJC_RETAINAUTORELEASEDRETURNVALUE : RuntimeLibcall; +def OBJC_CLAIMAUTORELEASEDRETURNVALUE : RuntimeLibcall; +def OBJC_RETAINBLOCK : RuntimeLibcall; +def OBJC_STORESTRONG : RuntimeLibcall; +def OBJC_STOREWEAK : RuntimeLibcall; +def OBJC_UNSAFECLAIMAUTORELEASEDRETURNVALUE : RuntimeLibcall; +def OBJC_RETAINEDOBJECT : RuntimeLibcall; +def OBJC_UNRETAINEDOBJECT : RuntimeLibcall; +def OBJC_UNRETAINEDPOINTER : RuntimeLibcall; +def OBJC_RETAIN_AUTORELEASE : RuntimeLibcall; 
+def OBJC_SYNC_ENTER : RuntimeLibcall; +def OBJC_SYNC_EXIT : RuntimeLibcall; + //-------------------------------------------------------------------- // Define implementation default libcalls //-------------------------------------------------------------------- @@ -1032,6 +1060,37 @@ defvar LibmHasSinCosF80 = LibcallImpls<(add sincos_f80), hasSinCos>; defvar LibmHasSinCosF128 = LibcallImpls<(add sincos_f128), hasSinCos>; defvar LibmHasSinCosPPCF128 = LibcallImpls<(add sincos_ppcf128), hasSinCos>; +//===----------------------------------------------------------------------===// +// Objective-C Runtime Libcalls +//===----------------------------------------------------------------------===// + +def objc_autorelease : RuntimeLibcallImpl; +def objc_autoreleasePoolPop : RuntimeLibcallImpl; +def objc_autoreleasePoolPush : RuntimeLibcallImpl; +def objc_autoreleaseReturnValue : RuntimeLibcallImpl; +def objc_copyWeak : RuntimeLibcallImpl; +def objc_destroyWeak : RuntimeLibcallImpl; +def objc_initWeak : RuntimeLibcallImpl; +def objc_loadWeak : RuntimeLibcallImpl; +def objc_loadWeakRetained : RuntimeLibcallImpl; +def objc_moveWeak : RuntimeLibcallImpl; +def objc_release : RuntimeLibcallImpl; +def objc_retain : RuntimeLibcallImpl; +def objc_retainAutorelease : RuntimeLibcallImpl; +def objc_retainAutoreleaseReturnValue : RuntimeLibcallImpl; +def objc_retainAutoreleasedReturnValue : RuntimeLibcallImpl; +def objc_claimAutoreleasedReturnValue : RuntimeLibcallImpl; +def objc_retainBlock : RuntimeLibcallImpl; +def objc_storeStrong : RuntimeLibcallImpl; +def objc_storeWeak : RuntimeLibcallImpl; +def objc_unsafeClaimAutoreleasedReturnValue : RuntimeLibcallImpl; +def objc_retainedObject : RuntimeLibcallImpl; +def objc_unretainedObject : RuntimeLibcallImpl; +def objc_unretainedPointer : RuntimeLibcallImpl; +def objc_retain_autorelease : RuntimeLibcallImpl; +def objc_sync_enter : RuntimeLibcallImpl; +def objc_sync_exit : RuntimeLibcallImpl; + 
//===----------------------------------------------------------------------===// // AArch64 Runtime Libcalls //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h index d2d0f22309fd0..f22b376c3ab5b 100644 --- a/llvm/include/llvm/Object/ELFTypes.h +++ b/llvm/include/llvm/Object/ELFTypes.h @@ -18,6 +18,7 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/UniqueBBID.h" #include #include #include @@ -926,6 +927,8 @@ struct BBAddrMap { : ID(ID), Offset(Offset), Size(Size), MD(MD), CallsiteOffsets(std::move(CallsiteOffsets)) {} + UniqueBBID getID() const { return {ID, 0}; } + bool operator==(const BBEntry &Other) const { return ID == Other.ID && Offset == Other.Offset && Size == Other.Size && MD == Other.MD && CallsiteOffsets == Other.CallsiteOffsets; diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h index 574f9508420a0..02a3194e09784 100644 --- a/llvm/include/llvm/Support/Endian.h +++ b/llvm/include/llvm/Support/Endian.h @@ -223,10 +223,11 @@ struct packed_endian_specific_integral { explicit packed_endian_specific_integral(value_type val) { *this = val; } - operator value_type() const { + value_type value() const { return endian::read( (const void*)Value.buffer); } + operator value_type() const { return value(); } void operator=(value_type newValue) { endian::write( diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 6ba0290cc77a6..b905576b61791 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -744,6 +744,8 @@ HANDLE_TARGET_OPCODE(G_GET_FPMODE) HANDLE_TARGET_OPCODE(G_SET_FPMODE) HANDLE_TARGET_OPCODE(G_RESET_FPMODE) +HANDLE_TARGET_OPCODE(G_GET_ROUNDING) + /// Generic pointer offset HANDLE_TARGET_OPCODE(G_PTR_ADD) diff --git 
a/llvm/include/llvm/Support/UniqueBBID.h b/llvm/include/llvm/Support/UniqueBBID.h new file mode 100644 index 0000000000000..a5715cd107629 --- /dev/null +++ b/llvm/include/llvm/Support/UniqueBBID.h @@ -0,0 +1,50 @@ +//===- llvm/Support/UniqueBBID.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Unique fixed ID assigned to basic blocks upon their creation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_UNIQUEBBID_H +#define LLVM_SUPPORT_UNIQUEBBID_H + +#include "llvm/ADT/DenseMapInfo.h" + +namespace llvm { + +// This structure represents the information for a basic block pertaining to +// the basic block sections profile. +struct UniqueBBID { + unsigned BaseID; + unsigned CloneID; +}; + +// Provides DenseMapInfo for UniqueBBID. 
+template <> struct DenseMapInfo { + static inline UniqueBBID getEmptyKey() { + unsigned EmptyKey = DenseMapInfo::getEmptyKey(); + return UniqueBBID{EmptyKey, EmptyKey}; + } + static inline UniqueBBID getTombstoneKey() { + unsigned TombstoneKey = DenseMapInfo::getTombstoneKey(); + return UniqueBBID{TombstoneKey, TombstoneKey}; + } + static unsigned getHashValue(const UniqueBBID &Val) { + std::pair PairVal = + std::make_pair(Val.BaseID, Val.CloneID); + return DenseMapInfo>::getHashValue(PairVal); + } + static bool isEqual(const UniqueBBID &LHS, const UniqueBBID &RHS) { + return DenseMapInfo::isEqual(LHS.BaseID, RHS.BaseID) && + DenseMapInfo::isEqual(LHS.CloneID, RHS.CloneID); + } +}; + +} // end namespace llvm + +#endif // LLVM_SUPPORT_UNIQUEBBID_H diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index bcf49b448e782..ce4750db88c9a 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1267,6 +1267,12 @@ def G_READSTEADYCOUNTER : GenericInstruction { let hasSideEffects = true; } +def G_GET_ROUNDING : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins); + let hasSideEffects = true; +} + //------------------------------------------------------------------------------ // Memory ops //------------------------------------------------------------------------------ diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 5b85060f9caa1..428342f51ad2e 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -187,6 +187,20 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, OS << " da analyze - "; if (auto D = DA->depends(&*SrcI, &*DstI, /*UnderRuntimeAssumptions=*/true)) { + +#ifndef NDEBUG + // Verify that the distance being zero is equivalent to the + // direction being EQ. 
+ for (unsigned Level = 1; Level <= D->getLevels(); Level++) { + const SCEV *Distance = D->getDistance(Level); + bool IsDistanceZero = Distance && Distance->isZero(); + bool IsDirectionEQ = + D->getDirection(Level) == Dependence::DVEntry::EQ; + assert(IsDistanceZero == IsDirectionEQ && + "Inconsistent distance and direction."); + } +#endif + // Normalize negative direction vectors if required by clients. if (NormalizeResults && D->normalize(&SE)) OS << "normalized - "; @@ -3991,6 +4005,28 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, if (CompleteLoops[II]) Result.DV[II - 1].Scalar = false; + // Set the distance to zero if the direction is EQ. + // TODO: Ideally, the distance should be set to 0 immediately simultaneously + // with the corresponding direction being set to EQ. + for (unsigned II = 1; II <= Result.getLevels(); ++II) { + if (Result.getDirection(II) == Dependence::DVEntry::EQ) { + if (Result.DV[II - 1].Distance == nullptr) + Result.DV[II - 1].Distance = SE->getZero(SrcSCEV->getType()); + else + assert(Result.DV[II - 1].Distance->isZero() && + "Inconsistency between distance and direction"); + } + +#ifndef NDEBUG + // Check that the converse (i.e., if the distance is zero, then the + // direction is EQ) holds. + const SCEV *Distance = Result.getDistance(II); + if (Distance && Distance->isZero()) + assert(Result.getDirection(II) == Dependence::DVEntry::EQ && + "Distance is zero, but direction is not EQ"); +#endif + } + if (PossiblyLoopIndependent) { // Make sure the LoopIndependent flag is set correctly. 
// All directions must include equal, otherwise no diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7b71dad23948c..be6ffdbb1aabe 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1954,6 +1954,37 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, return Distance % Stride; } +bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src, + Type *SrcTy, + const SCEV *Sink, + Type *SinkTy) { + const SCEV *BTC = PSE.getBackedgeTakenCount(); + const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); + ScalarEvolution &SE = *PSE.getSE(); + const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess( + InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds); + if (isa(SrcStart_) || isa(SrcEnd_)) + return false; + + const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess( + InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds); + if (isa(SinkStart_) || + isa(SinkEnd_)) + return false; + + if (!LoopGuards) + LoopGuards.emplace(ScalarEvolution::LoopGuards::collect(InnermostLoop, SE)); + + auto SrcEnd = SE.applyLoopGuards(SrcEnd_, *LoopGuards); + auto SinkStart = SE.applyLoopGuards(SinkStart_, *LoopGuards); + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) + return true; + + auto SinkEnd = SE.applyLoopGuards(SinkEnd_, *LoopGuards); + auto SrcStart = SE.applyLoopGuards(SrcStart_, *LoopGuards); + return SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart); +} + std::variant MemoryDepChecker::getDependenceDistanceStrideAndSize( @@ -2001,37 +2032,13 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst << ": " << *Dist << "\n"); - // Check if we can prove that Sink only accesses memory after Src's end or - // vice versa. 
At the moment this is limited to cases where either source or + // At the moment this is limited to cases where either source or // sink are loop invariant to avoid compile-time increases. This is not // required for correctness. if (SE.isLoopInvariant(Src, InnermostLoop) || SE.isLoopInvariant(Sink, InnermostLoop)) { - const SCEV *BTC = PSE.getBackedgeTakenCount(); - const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); - const auto &[SrcStart_, SrcEnd_] = - getStartAndEndForAccess(InnermostLoop, Src, ATy, BTC, SymbolicMaxBTC, - PSE.getSE(), &PointerBounds); - const auto &[SinkStart_, SinkEnd_] = - getStartAndEndForAccess(InnermostLoop, Sink, BTy, BTC, SymbolicMaxBTC, - PSE.getSE(), &PointerBounds); - if (!isa(SrcStart_) && - !isa(SrcEnd_) && - !isa(SinkStart_) && - !isa(SinkEnd_)) { - if (!LoopGuards) - LoopGuards.emplace( - ScalarEvolution::LoopGuards::collect(InnermostLoop, SE)); - auto SrcEnd = SE.applyLoopGuards(SrcEnd_, *LoopGuards); - auto SinkStart = SE.applyLoopGuards(SinkStart_, *LoopGuards); - if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) - return MemoryDepChecker::Dependence::NoDep; - - auto SinkEnd = SE.applyLoopGuards(SinkEnd_, *LoopGuards); - auto SrcStart = SE.applyLoopGuards(SrcStart_, *LoopGuards); - if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart)) - return MemoryDepChecker::Dependence::NoDep; - } + if (areAccessesCompletelyBeforeOrAfter(Src, ATy, Sink, BTy)) + return Dependence::NoDep; } // Need accesses with constant strides and the same direction for further diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 1c66f5c877f59..24adfa346c642 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -601,6 +601,9 @@ static int CompareValueComplexity(const LoopInfo *const LI, Value *LV, if (const auto *LGV = dyn_cast(LV)) { const auto *RGV = cast(RV); + if (auto L = LGV->getLinkage() - RGV->getLinkage()) + return L; + 
const auto IsGVNameSemantic = [&](const GlobalValue *GV) { auto LT = GV->getLinkage(); return !(GlobalValue::isPrivateLinkage(LT) || diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp index 2101fdfacfc8f..15107c262980c 100644 --- a/llvm/lib/Analysis/UniformityAnalysis.cpp +++ b/llvm/lib/Analysis/UniformityAnalysis.cpp @@ -146,12 +146,12 @@ char UniformityInfoWrapperPass::ID = 0; UniformityInfoWrapperPass::UniformityInfoWrapperPass() : FunctionPass(ID) {} INITIALIZE_PASS_BEGIN(UniformityInfoWrapperPass, "uniformity", - "Uniformity Analysis", true, true) + "Uniformity Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(UniformityInfoWrapperPass, "uniformity", - "Uniformity Analysis", true, true) + "Uniformity Analysis", false, true) void UniformityInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7e0d81ff4b196..617b4dffa2199 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -56,6 +56,7 @@ #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" +#include "llvm/IR/TypedPointerType.h" #include "llvm/IR/UseListOrder.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueSymbolTable.h" @@ -315,7 +316,7 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase { /// Emit the current module to the bitstream. 
void write(); -private: +protected: uint64_t bitcodeStartBit() { return BitcodeStartBit; } size_t addToStrtab(StringRef Str); @@ -5567,6 +5568,986 @@ void BitcodeWriter::writeIndex( IndexWriter.write(); } + + + + + +class NonOpaqueTypeModuleWriter : public ModuleBitcodeWriter { + + public: + NonOpaqueTypeModuleWriter(const Module &M, StringTableBuilder &StrtabBuilder, + BitstreamWriter &Stream, + bool ShouldPreserveUseListOrder, + const ModuleSummaryIndex &Index, + bool GenerateHash, + const ModuleHash &ModHash, + DenseMap *NonOpaqueTypeMap) + : ModuleBitcodeWriter(M, StrtabBuilder, Stream, + ShouldPreserveUseListOrder, &Index, + GenerateHash, const_cast(&ModHash)), + NonOpaqueTypeMap(NonOpaqueTypeMap) {} + + void write(); + + + + private: + DenseMap *NonOpaqueTypeMap; + void writeTypeTable(); + void writeFunction(const Function &F, DenseMap &FunctionToBitcodeIndex); + void writeInstruction(const Instruction &I, unsigned InstID, SmallVectorImpl &Vals); +}; + + +void NonOpaqueTypeModuleWriter::writeTypeTable() { + const ValueEnumerator::TypeList &TypeList = VE.getTypes(); + + Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */); + SmallVector TypeVals; + + uint64_t NumBits = VE.computeBitsRequiredForTypeIndices(); + + + // Abbrev for TYPE_CODE_POINTER. + auto Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); + Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0 + unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // Abbrev for TYPE_CODE_FUNCTION. + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); + unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // Abbrev for TYPE_CODE_STRUCT_ANON. 
+ Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); + unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // Abbrev for TYPE_CODE_STRUCT_NAME. + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAME)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + unsigned StructNameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // Abbrev for TYPE_CODE_STRUCT_NAMED. + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); + unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // Abbrev for TYPE_CODE_ARRAY. + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); + unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // Emit an entry count so the reader can reserve space. + TypeVals.push_back(TypeList.size()); + Stream.EmitRecord(bitc::TYPE_CODE_NUMENTRY, TypeVals); + TypeVals.clear(); + + // Loop over all of the types, emitting each in turn. 
+ for (Type *T : TypeList) { + int AbbrevToUse = 0; + unsigned Code = 0; + + switch (T->getTypeID()) { + case Type::VoidTyID: Code = bitc::TYPE_CODE_VOID; break; + case Type::HalfTyID: Code = bitc::TYPE_CODE_HALF; break; + case Type::BFloatTyID: Code = bitc::TYPE_CODE_BFLOAT; break; + case Type::FloatTyID: Code = bitc::TYPE_CODE_FLOAT; break; + case Type::DoubleTyID: Code = bitc::TYPE_CODE_DOUBLE; break; + case Type::X86_FP80TyID: Code = bitc::TYPE_CODE_X86_FP80; break; + case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break; + case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break; + case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; + case Type::MetadataTyID: + Code = bitc::TYPE_CODE_METADATA; + break; + case Type::X86_AMXTyID: Code = bitc::TYPE_CODE_X86_AMX; break; + case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; + case Type::IntegerTyID: + // INTEGER: [width] + Code = bitc::TYPE_CODE_INTEGER; + TypeVals.push_back(cast(T)->getBitWidth()); + break; + case Type::TypedPointerTyID: { + TypedPointerType *PTy = cast(T); + // POINTER: [pointee type, address space] + Code = bitc::TYPE_CODE_POINTER; + TypeVals.push_back(VE.getTypeID(PTy->getElementType())); + unsigned AddressSpace = PTy->getAddressSpace(); + TypeVals.push_back(AddressSpace); + if (AddressSpace == 0) + AbbrevToUse = PtrAbbrev; + break; + } + case Type::PointerTyID: { + PointerType *PTy = cast(T); + Code = bitc::TYPE_CODE_POINTER; + // opaque pointers are unsupported, so emit using an opaque element type + auto ET = StructType::get(PTy->getContext()); + TypeVals.push_back(VE.getTypeID(ET)); + unsigned AddressSpace = PTy->getAddressSpace(); + TypeVals.push_back(AddressSpace); + if (AddressSpace == 0) + AbbrevToUse = PtrAbbrev; + break; + } + case Type::FunctionTyID: { + FunctionType *FT = cast(T); + // FUNCTION: [isvararg, retty, paramty x N] + Code = bitc::TYPE_CODE_FUNCTION; + TypeVals.push_back(FT->isVarArg()); + TypeVals.push_back(VE.getTypeID(FT->getReturnType())); + 
for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) + TypeVals.push_back(VE.getTypeID(FT->getParamType(i))); + AbbrevToUse = FunctionAbbrev; + break; + } + case Type::StructTyID: { + StructType *ST = cast(T); + // STRUCT: [ispacked, eltty x N] + TypeVals.push_back(ST->isPacked()); + // Output all of the element types. + for (Type *ET : ST->elements()) + TypeVals.push_back(VE.getTypeID(ET)); + + if (ST->isLiteral()) { + Code = bitc::TYPE_CODE_STRUCT_ANON; + AbbrevToUse = StructAnonAbbrev; + } else { + if (ST->isOpaque()) { + Code = bitc::TYPE_CODE_OPAQUE; + } else { + Code = bitc::TYPE_CODE_STRUCT_NAMED; + AbbrevToUse = StructNamedAbbrev; + } + + // Emit the name if it is present. + if (!ST->getName().empty()) + writeStringRecord(Stream, bitc::TYPE_CODE_STRUCT_NAME, ST->getName(), + StructNameAbbrev); + } + break; + } + case Type::ArrayTyID: { + ArrayType *AT = cast(T); + // ARRAY: [numelts, eltty] + Code = bitc::TYPE_CODE_ARRAY; + TypeVals.push_back(AT->getNumElements()); + TypeVals.push_back(VE.getTypeID(AT->getElementType())); + AbbrevToUse = ArrayAbbrev; + break; + } + case Type::FixedVectorTyID: + case Type::ScalableVectorTyID: { + VectorType *VT = cast(T); + // VECTOR [numelts, eltty] or + // [numelts, eltty, scalable] + Code = bitc::TYPE_CODE_VECTOR; + TypeVals.push_back(VT->getElementCount().getKnownMinValue()); + TypeVals.push_back(VE.getTypeID(VT->getElementType())); + if (isa(VT)) + TypeVals.push_back(true); + break; + } + case Type::TargetExtTyID: { + TargetExtType *TET = cast(T); + Code = bitc::TYPE_CODE_TARGET_TYPE; + writeStringRecord(Stream, bitc::TYPE_CODE_STRUCT_NAME, TET->getName(), + StructNameAbbrev); + TypeVals.push_back(TET->getNumTypeParameters()); + for (Type *InnerTy : TET->type_params()) + TypeVals.push_back(VE.getTypeID(InnerTy)); + llvm::append_range(TypeVals, TET->int_params()); + break; + } + + } + + // Emit the finished record. 
+ Stream.EmitRecord(Code, TypeVals, AbbrevToUse); + TypeVals.clear(); + } + + Stream.ExitBlock(); + + +} + + +void NonOpaqueTypeModuleWriter::writeFunction(const Function &F, DenseMap &FunctionToBitcodeIndex) { +// Save the bitcode index of the start of this function block for recording +// in the VST. +FunctionToBitcodeIndex[&F] = Stream.GetCurrentBitNo(); + +Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 5); +VE.incorporateFunction(F); + +SmallVector Vals; + +// Emit the number of basic blocks, so the reader can create them ahead of +// time. +Vals.push_back(VE.getBasicBlocks().size()); +Stream.EmitRecord(bitc::FUNC_CODE_DECLAREBLOCKS, Vals); +Vals.clear(); + +// If there are function-local constants, emit them now. +unsigned CstStart, CstEnd; +VE.getFunctionConstantRange(CstStart, CstEnd); +writeConstants(CstStart, CstEnd, false); + +// If there is function-local metadata, emit it now. +writeFunctionMetadata(F); + +// Keep a running idea of what the instruction ID is. +unsigned InstID = CstEnd; + +bool NeedsMetadataAttachment = F.hasMetadata(); + +DILocation *LastDL = nullptr; +SmallSetVector BlockAddressUsers; + +// Finally, emit all the instructions, in order. +for (const BasicBlock &BB : F) { +for (const Instruction &I : BB) { + writeInstruction(I, InstID, Vals); + + if (!I.getType()->isVoidTy()) + ++InstID; + + // If the instruction has metadata, write a metadata attachment later. + NeedsMetadataAttachment |= I.hasMetadataOtherThanDebugLoc(); + + // If the instruction has a debug location, emit it. + if (DILocation *DL = I.getDebugLoc()) { + if (DL == LastDL) { + // Just repeat the same debug loc as last time. 
+ Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals); + } else { + Vals.push_back(DL->getLine()); + Vals.push_back(DL->getColumn()); + Vals.push_back(VE.getMetadataOrNullID(DL->getScope())); + Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt())); + Vals.push_back(DL->isImplicitCode()); + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals); + Vals.clear(); + LastDL = DL; + } + } + + // If the instruction has DbgRecords attached to it, emit them. Note that + // they come after the instruction so that it's easy to attach them again + // when reading the bitcode, even though conceptually the debug locations + // start "before" the instruction. + if (I.hasDbgRecords()) { + /// Try to push the value only (unwrapped), otherwise push the + /// metadata wrapped value. Returns true if the value was pushed + /// without the ValueAsMetadata wrapper. + auto PushValueOrMetadata = [&Vals, InstID, + this](Metadata *RawLocation) { + assert(RawLocation && + "RawLocation unexpectedly null in DbgVariableRecord"); + if (ValueAsMetadata *VAM = dyn_cast(RawLocation)) { + SmallVector ValAndType; + // If the value is a fwd-ref the type is also pushed. We don't + // want the type, so fwd-refs are kept wrapped (pushValueAndType + // returns false if the value is pushed without type). + if (!pushValueAndType(VAM->getValue(), InstID, ValAndType)) { + Vals.push_back(ValAndType[0]); + return true; + } + } + // The metadata is a DIArgList, or ValueAsMetadata wrapping a + // fwd-ref. Push the metadata ID. + Vals.push_back(VE.getMetadataID(RawLocation)); + return false; + }; + + // Write out non-instruction debug information attached to this + // instruction. Write it after the instruction so that it's easy to + // re-attach to the instruction reading the records in. 
+ for (DbgRecord &DR : I.DebugMarker->getDbgRecordRange()) { + if (DbgLabelRecord *DLR = dyn_cast(&DR)) { + Vals.push_back(VE.getMetadataID(&*DLR->getDebugLoc())); + Vals.push_back(VE.getMetadataID(DLR->getLabel())); + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_LABEL, Vals); + Vals.clear(); + continue; + } + + // First 3 fields are common to all kinds: + // DILocation, DILocalVariable, DIExpression + // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE) + // ..., LocationMetadata + // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE - abbrev'd) + // ..., Value + // dbg_declare (FUNC_CODE_DEBUG_RECORD_DECLARE) + // ..., LocationMetadata + // dbg_assign (FUNC_CODE_DEBUG_RECORD_ASSIGN) + // ..., LocationMetadata, DIAssignID, DIExpression, LocationMetadata + DbgVariableRecord &DVR = cast(DR); + Vals.push_back(VE.getMetadataID(&*DVR.getDebugLoc())); + Vals.push_back(VE.getMetadataID(DVR.getVariable())); + Vals.push_back(VE.getMetadataID(DVR.getExpression())); + if (DVR.isDbgValue()) { + if (PushValueOrMetadata(DVR.getRawLocation())) + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE, Vals, + FUNCTION_DEBUG_RECORD_VALUE_ABBREV); + else + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_VALUE, Vals); + } else if (DVR.isDbgDeclare()) { + Vals.push_back(VE.getMetadataID(DVR.getRawLocation())); + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_DECLARE, Vals); + } else { + assert(DVR.isDbgAssign() && "Unexpected DbgRecord kind"); + Vals.push_back(VE.getMetadataID(DVR.getRawLocation())); + Vals.push_back(VE.getMetadataID(DVR.getAssignID())); + Vals.push_back(VE.getMetadataID(DVR.getAddressExpression())); + Vals.push_back(VE.getMetadataID(DVR.getRawAddress())); + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN, Vals); + } + Vals.clear(); + } + } +} + +if (BlockAddress *BA = BlockAddress::lookup(&BB)) { + SmallVector Worklist{BA}; + SmallPtrSet Visited{BA}; + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + for (User *U : V->users()) { + if (auto *I = 
dyn_cast(U)) { + Function *P = I->getFunction(); + if (P != &F) + BlockAddressUsers.insert(P); + } else if (isa(U) && !isa(U) && + Visited.insert(U).second) + Worklist.push_back(U); + } + } +} +} + +if (!BlockAddressUsers.empty()) { +Vals.resize(BlockAddressUsers.size()); +for (auto I : llvm::enumerate(BlockAddressUsers)) + Vals[I.index()] = VE.getValueID(I.value()); +Stream.EmitRecord(bitc::FUNC_CODE_BLOCKADDR_USERS, Vals); +Vals.clear(); +} + +// Emit names for all the instructions etc. +if (auto *Symtab = F.getValueSymbolTable()) +writeFunctionLevelValueSymbolTable(*Symtab); + +if (NeedsMetadataAttachment) +writeFunctionMetadataAttachment(F); +if (VE.shouldPreserveUseListOrder()) +writeUseListBlock(&F); +VE.purgeFunction(); +Stream.ExitBlock(); +} + +void NonOpaqueTypeModuleWriter::writeInstruction(const Instruction &I, + unsigned InstID, + SmallVectorImpl &Vals) { + unsigned Code = 0; + unsigned AbbrevToUse = 0; + VE.setInstructionID(&I); + switch (I.getOpcode()) { + default: + if (Instruction::isCast(I.getOpcode())) { + Code = bitc::FUNC_CODE_INST_CAST; + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = FUNCTION_INST_CAST_ABBREV; + // Use NonOpaqueTypeMap for pointer types if available + Type *DestType = I.getType(); + if (NonOpaqueTypeMap && NonOpaqueTypeMap->count(&I)) { + DestType = (*NonOpaqueTypeMap)[&I]; + } + Vals.push_back(VE.getTypeID(DestType)); + Vals.push_back(getEncodedCastOpcode(I.getOpcode())); + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) { + if (AbbrevToUse == FUNCTION_INST_CAST_ABBREV) + AbbrevToUse = FUNCTION_INST_CAST_FLAGS_ABBREV; + Vals.push_back(Flags); + } + } else { + assert(isa(I) && "Unknown instruction!"); + Code = bitc::FUNC_CODE_INST_BINOP; + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = FUNCTION_INST_BINOP_ABBREV; + pushValue(I.getOperand(1), InstID, Vals); + Vals.push_back(getEncodedBinaryOpcode(I.getOpcode())); + uint64_t Flags = getOptimizationFlags(&I); + if (Flags 
!= 0) { + if (AbbrevToUse == FUNCTION_INST_BINOP_ABBREV) + AbbrevToUse = FUNCTION_INST_BINOP_FLAGS_ABBREV; + Vals.push_back(Flags); + } + } + break; + case Instruction::FNeg: { + Code = bitc::FUNC_CODE_INST_UNOP; + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = FUNCTION_INST_UNOP_ABBREV; + Vals.push_back(getEncodedUnaryOpcode(I.getOpcode())); + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) { + if (AbbrevToUse == FUNCTION_INST_UNOP_ABBREV) + AbbrevToUse = FUNCTION_INST_UNOP_FLAGS_ABBREV; + Vals.push_back(Flags); + } + break; + } + case Instruction::GetElementPtr: { + Code = bitc::FUNC_CODE_INST_GEP; + AbbrevToUse = FUNCTION_INST_GEP_ABBREV; + auto &GEPInst = cast(I); + Vals.push_back(getOptimizationFlags(&I)); + Vals.push_back(VE.getTypeID(GEPInst.getSourceElementType())); + for (const Value *Op : I.operands()) + pushValueAndType(Op, InstID, Vals); + break; + } + case Instruction::ExtractValue: { + Code = bitc::FUNC_CODE_INST_EXTRACTVAL; + pushValueAndType(I.getOperand(0), InstID, Vals); + const ExtractValueInst *EVI = cast(&I); + Vals.append(EVI->idx_begin(), EVI->idx_end()); + break; + } + case Instruction::InsertValue: { + Code = bitc::FUNC_CODE_INST_INSERTVAL; + pushValueAndType(I.getOperand(0), InstID, Vals); + pushValueAndType(I.getOperand(1), InstID, Vals); + const InsertValueInst *IVI = cast(&I); + Vals.append(IVI->idx_begin(), IVI->idx_end()); + break; + } + case Instruction::Select: { + Code = bitc::FUNC_CODE_INST_VSELECT; + pushValueAndType(I.getOperand(1), InstID, Vals); + pushValue(I.getOperand(2), InstID, Vals); + pushValueAndType(I.getOperand(0), InstID, Vals); + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) + Vals.push_back(Flags); + break; + } + case Instruction::ExtractElement: + Code = bitc::FUNC_CODE_INST_EXTRACTELT; + pushValueAndType(I.getOperand(0), InstID, Vals); + pushValueAndType(I.getOperand(1), InstID, Vals); + break; + case Instruction::InsertElement: + Code = 
bitc::FUNC_CODE_INST_INSERTELT; + pushValueAndType(I.getOperand(0), InstID, Vals); + pushValue(I.getOperand(1), InstID, Vals); + pushValueAndType(I.getOperand(2), InstID, Vals); + break; + case Instruction::ShuffleVector: + Code = bitc::FUNC_CODE_INST_SHUFFLEVEC; + pushValueAndType(I.getOperand(0), InstID, Vals); + pushValue(I.getOperand(1), InstID, Vals); + pushValue(cast(I).getShuffleMaskForBitcode(), InstID, + Vals); + break; + case Instruction::ICmp: + case Instruction::FCmp: { + // compare returning Int1Ty or vector of Int1Ty + Code = bitc::FUNC_CODE_INST_CMP2; + AbbrevToUse = FUNCTION_INST_CMP_ABBREV; + if (pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = 0; + pushValue(I.getOperand(1), InstID, Vals); + Vals.push_back(cast(I).getPredicate()); + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) { + Vals.push_back(Flags); + if (AbbrevToUse) + AbbrevToUse = FUNCTION_INST_CMP_FLAGS_ABBREV; + } + break; + } + + case Instruction::Ret: + { + Code = bitc::FUNC_CODE_INST_RET; + unsigned NumOperands = I.getNumOperands(); + if (NumOperands == 0) + AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV; + else if (NumOperands == 1) { + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) + AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV; + } else { + for (const Value *Op : I.operands()) + pushValueAndType(Op, InstID, Vals); + } + } + break; + case Instruction::Br: + { + Code = bitc::FUNC_CODE_INST_BR; + AbbrevToUse = FUNCTION_INST_BR_UNCOND_ABBREV; + const BranchInst &II = cast(I); + Vals.push_back(VE.getValueID(II.getSuccessor(0))); + if (II.isConditional()) { + Vals.push_back(VE.getValueID(II.getSuccessor(1))); + pushValue(II.getCondition(), InstID, Vals); + AbbrevToUse = FUNCTION_INST_BR_COND_ABBREV; + } + } + break; + case Instruction::Switch: + { + Code = bitc::FUNC_CODE_INST_SWITCH; + const SwitchInst &SI = cast(I); + Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); + pushValue(SI.getCondition(), InstID, Vals); + 
Vals.push_back(VE.getValueID(SI.getDefaultDest())); + for (auto Case : SI.cases()) { + Vals.push_back(VE.getValueID(Case.getCaseValue())); + Vals.push_back(VE.getValueID(Case.getCaseSuccessor())); + } + } + break; + case Instruction::IndirectBr: + Code = bitc::FUNC_CODE_INST_INDIRECTBR; + // Use NonOpaqueTypeMap for operand type if available + if (NonOpaqueTypeMap && NonOpaqueTypeMap->count(I.getOperand(0))) { + Vals.push_back(VE.getTypeID((*NonOpaqueTypeMap)[I.getOperand(0)])); + } else { + Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); + } + // Encode the address operand as relative, but not the basic blocks. + pushValue(I.getOperand(0), InstID, Vals); + for (const Value *Op : drop_begin(I.operands())) + Vals.push_back(VE.getValueID(Op)); + break; + + case Instruction::Invoke: { + const InvokeInst *II = cast(&I); + const Value *Callee = II->getCalledOperand(); + FunctionType *FTy = II->getFunctionType(); + + if (II->hasOperandBundles()) + writeOperandBundles(*II, InstID); + + Code = bitc::FUNC_CODE_INST_INVOKE; + + Vals.push_back(VE.getAttributeListID(II->getAttributes())); + Vals.push_back(II->getCallingConv() | 1 << 13); + Vals.push_back(VE.getValueID(II->getNormalDest())); + Vals.push_back(VE.getValueID(II->getUnwindDest())); + Vals.push_back(VE.getTypeID(FTy)); + pushValueAndType(Callee, InstID, Vals); + + // Emit value #'s for the fixed parameters. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + pushValue(I.getOperand(i), InstID, Vals); // fixed param. + + // Emit type/value pairs for varargs params. 
+ if (FTy->isVarArg()) { + for (unsigned i = FTy->getNumParams(), e = II->arg_size(); i != e; ++i) + pushValueAndType(I.getOperand(i), InstID, Vals); // vararg + } + break; + } + case Instruction::Resume: + Code = bitc::FUNC_CODE_INST_RESUME; + pushValueAndType(I.getOperand(0), InstID, Vals); + break; + case Instruction::CleanupRet: { + Code = bitc::FUNC_CODE_INST_CLEANUPRET; + const auto &CRI = cast(I); + pushValue(CRI.getCleanupPad(), InstID, Vals); + if (CRI.hasUnwindDest()) + Vals.push_back(VE.getValueID(CRI.getUnwindDest())); + break; + } + case Instruction::CatchRet: { + Code = bitc::FUNC_CODE_INST_CATCHRET; + const auto &CRI = cast(I); + pushValue(CRI.getCatchPad(), InstID, Vals); + Vals.push_back(VE.getValueID(CRI.getSuccessor())); + break; + } + case Instruction::CleanupPad: + case Instruction::CatchPad: { + const auto &FuncletPad = cast(I); + Code = isa(FuncletPad) ? bitc::FUNC_CODE_INST_CATCHPAD + : bitc::FUNC_CODE_INST_CLEANUPPAD; + pushValue(FuncletPad.getParentPad(), InstID, Vals); + + unsigned NumArgOperands = FuncletPad.arg_size(); + Vals.push_back(NumArgOperands); + for (unsigned Op = 0; Op != NumArgOperands; ++Op) + pushValueAndType(FuncletPad.getArgOperand(Op), InstID, Vals); + break; + } + case Instruction::CatchSwitch: { + Code = bitc::FUNC_CODE_INST_CATCHSWITCH; + const auto &CatchSwitch = cast(I); + + pushValue(CatchSwitch.getParentPad(), InstID, Vals); + + unsigned NumHandlers = CatchSwitch.getNumHandlers(); + Vals.push_back(NumHandlers); + for (const BasicBlock *CatchPadBB : CatchSwitch.handlers()) + Vals.push_back(VE.getValueID(CatchPadBB)); + + if (CatchSwitch.hasUnwindDest()) + Vals.push_back(VE.getValueID(CatchSwitch.getUnwindDest())); + break; + } + case Instruction::CallBr: { + const CallBrInst *CBI = cast(&I); + const Value *Callee = CBI->getCalledOperand(); + FunctionType *FTy = CBI->getFunctionType(); + + if (CBI->hasOperandBundles()) + writeOperandBundles(*CBI, InstID); + + Code = bitc::FUNC_CODE_INST_CALLBR; + + 
Vals.push_back(VE.getAttributeListID(CBI->getAttributes())); + + Vals.push_back(CBI->getCallingConv() << bitc::CALL_CCONV | + 1 << bitc::CALL_EXPLICIT_TYPE); + + Vals.push_back(VE.getValueID(CBI->getDefaultDest())); + Vals.push_back(CBI->getNumIndirectDests()); + for (unsigned i = 0, e = CBI->getNumIndirectDests(); i != e; ++i) + Vals.push_back(VE.getValueID(CBI->getIndirectDest(i))); + + Vals.push_back(VE.getTypeID(FTy)); + pushValueAndType(Callee, InstID, Vals); + + // Emit value #'s for the fixed parameters. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + pushValue(I.getOperand(i), InstID, Vals); // fixed param. + + // Emit type/value pairs for varargs params. + if (FTy->isVarArg()) { + for (unsigned i = FTy->getNumParams(), e = CBI->arg_size(); i != e; ++i) + pushValueAndType(I.getOperand(i), InstID, Vals); // vararg + } + break; + } + case Instruction::Unreachable: + Code = bitc::FUNC_CODE_INST_UNREACHABLE; + AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV; + break; + + case Instruction::PHI: { + const PHINode &PN = cast(I); + Code = bitc::FUNC_CODE_INST_PHI; + // With the newer instruction encoding, forward references could give + // negative valued IDs. This is most common for PHIs, so we use + // signed VBRs. + SmallVector Vals64; + // Use NonOpaqueTypeMap for PHI type if available + Type *PHIType = PN.getType(); + if (NonOpaqueTypeMap && NonOpaqueTypeMap->count(&I)) { + PHIType = (*NonOpaqueTypeMap)[&I]; + } + Vals64.push_back(VE.getTypeID(PHIType)); + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + pushValueSigned(PN.getIncomingValue(i), InstID, Vals64); + Vals64.push_back(VE.getValueID(PN.getIncomingBlock(i))); + } + + uint64_t Flags = getOptimizationFlags(&I); + if (Flags != 0) + Vals64.push_back(Flags); + + // Emit a Vals64 vector and exit. 
+ Stream.EmitRecord(Code, Vals64, AbbrevToUse); + Vals64.clear(); + return; + } + + case Instruction::LandingPad: { + const LandingPadInst &LP = cast(I); + Code = bitc::FUNC_CODE_INST_LANDINGPAD; + // Use NonOpaqueTypeMap for landing pad type if available + Type *LPType = LP.getType(); + if (NonOpaqueTypeMap && NonOpaqueTypeMap->count(&I)) { + LPType = (*NonOpaqueTypeMap)[&I]; + } + Vals.push_back(VE.getTypeID(LPType)); + Vals.push_back(LP.isCleanup()); + Vals.push_back(LP.getNumClauses()); + for (unsigned I = 0, E = LP.getNumClauses(); I != E; ++I) { + if (LP.isCatch(I)) + Vals.push_back(LandingPadInst::Catch); + else + Vals.push_back(LandingPadInst::Filter); + pushValueAndType(LP.getClause(I), InstID, Vals); + } + break; + } + + case Instruction::Alloca: { + Code = bitc::FUNC_CODE_INST_ALLOCA; + const AllocaInst &AI = cast(I); + Vals.push_back(VE.getTypeID(AI.getAllocatedType())); + Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); + Vals.push_back(VE.getValueID(I.getOperand(0))); // size. 
+ using APV = AllocaPackedValues; + unsigned Record = 0; + unsigned EncodedAlign = getEncodedAlign(AI.getAlign()); + Bitfield::set( + Record, EncodedAlign & ((1 << APV::AlignLower::Bits) - 1)); + Bitfield::set(Record, + EncodedAlign >> APV::AlignLower::Bits); + Bitfield::set(Record, AI.isUsedWithInAlloca()); + Bitfield::set(Record, true); + Bitfield::set(Record, AI.isSwiftError()); + Vals.push_back(Record); + + unsigned AS = AI.getAddressSpace(); + if (AS != M.getDataLayout().getAllocaAddrSpace()) + Vals.push_back(AS); + break; + } + + case Instruction::Load: { + if (cast(I).isAtomic()) { + Code = bitc::FUNC_CODE_INST_LOADATOMIC; + pushValueAndType(I.getOperand(0), InstID, Vals); + } else { + Code = bitc::FUNC_CODE_INST_LOAD; + if (!pushValueAndType(I.getOperand(0), InstID, Vals)) // ptr + AbbrevToUse = FUNCTION_INST_LOAD_ABBREV; + } + // Use NonOpaqueTypeMap for loaded type if available + Type *LoadedType = I.getType(); + if (NonOpaqueTypeMap && NonOpaqueTypeMap->count(&I)) { + LoadedType = (*NonOpaqueTypeMap)[&I]; + } + Vals.push_back(VE.getTypeID(LoadedType)); + Vals.push_back(getEncodedAlign(cast(I).getAlign())); + Vals.push_back(cast(I).isVolatile()); + if (cast(I).isAtomic()) { + Vals.push_back(getEncodedOrdering(cast(I).getOrdering())); + Vals.push_back(getEncodedSyncScopeID(cast(I).getSyncScopeID())); + } + break; + } + case Instruction::Store: { + if (cast(I).isAtomic()) { + Code = bitc::FUNC_CODE_INST_STOREATOMIC; + } else { + Code = bitc::FUNC_CODE_INST_STORE; + AbbrevToUse = FUNCTION_INST_STORE_ABBREV; + } + if (pushValueAndType(I.getOperand(1), InstID, Vals)) // ptrty + ptr + AbbrevToUse = 0; + if (pushValueAndType(I.getOperand(0), InstID, Vals)) // valty + val + AbbrevToUse = 0; + Vals.push_back(getEncodedAlign(cast(I).getAlign())); + Vals.push_back(cast(I).isVolatile()); + if (cast(I).isAtomic()) { + Vals.push_back(getEncodedOrdering(cast(I).getOrdering())); + Vals.push_back( + getEncodedSyncScopeID(cast(I).getSyncScopeID())); + } + break; + } + case 
Instruction::AtomicCmpXchg: { + Code = bitc::FUNC_CODE_INST_CMPXCHG; + pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr + pushValueAndType(I.getOperand(1), InstID, Vals); // cmp. + pushValue(I.getOperand(2), InstID, Vals); // newval. + Vals.push_back(cast(I).isVolatile()); + Vals.push_back( + getEncodedOrdering(cast(I).getSuccessOrdering())); + Vals.push_back( + getEncodedSyncScopeID(cast(I).getSyncScopeID())); + Vals.push_back( + getEncodedOrdering(cast(I).getFailureOrdering())); + Vals.push_back(cast(I).isWeak()); + Vals.push_back(getEncodedAlign(cast(I).getAlign())); + break; + } + case Instruction::AtomicRMW: { + Code = bitc::FUNC_CODE_INST_ATOMICRMW; + pushValueAndType(I.getOperand(0), InstID, Vals); // ptrty + ptr + pushValueAndType(I.getOperand(1), InstID, Vals); // valty + val + Vals.push_back( + getEncodedRMWOperation(cast(I).getOperation())); + Vals.push_back(cast(I).isVolatile()); + Vals.push_back(getEncodedOrdering(cast(I).getOrdering())); + Vals.push_back( + getEncodedSyncScopeID(cast(I).getSyncScopeID())); + Vals.push_back(getEncodedAlign(cast(I).getAlign())); + break; + } + case Instruction::Fence: { + Code = bitc::FUNC_CODE_INST_FENCE; + const FenceInst &FI = cast(I); + Vals.push_back(getEncodedOrdering(FI.getOrdering())); + Vals.push_back(getEncodedSyncScopeID(FI.getSyncScopeID())); + break; + } + case Instruction::Call: { + const CallInst &CI = cast(I); + FunctionType *FTy = CI.getFunctionType(); + + if (CI.hasOperandBundles()) + writeOperandBundles(CI, InstID); + + Code = bitc::FUNC_CODE_INST_CALL; + + Vals.push_back(VE.getAttributeListID(CI.getAttributes())); + + unsigned Flags = getOptimizationFlags(&I); + Vals.push_back(CI.getCallingConv() << bitc::CALL_CCONV | + unsigned(CI.isTailCall()) << bitc::CALL_TAIL | + unsigned(CI.isMustTailCall()) << bitc::CALL_MUSTTAIL | + 1 << bitc::CALL_EXPLICIT_TYPE | + unsigned(CI.isNoTailCall()) << bitc::CALL_NOTAIL | + unsigned(Flags != 0) << bitc::CALL_FMF); + if (Flags != 0) + 
Vals.push_back(Flags); + + Vals.push_back(VE.getTypeID(FTy)); + pushValueAndType(CI.getCalledOperand(), InstID, Vals); // Callee + + // Emit value #'s for the fixed parameters. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + pushValue(CI.getArgOperand(i), InstID, Vals); // fixed param. + + // Emit type/value pairs for varargs params. + if (FTy->isVarArg()) { + for (unsigned i = FTy->getNumParams(), e = CI.arg_size(); i != e; ++i) + pushValueAndType(CI.getArgOperand(i), InstID, Vals); // varargs + } + break; + } + case Instruction::VAArg: { + Code = bitc::FUNC_CODE_INST_VAARG; + Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty + pushValue(I.getOperand(0), InstID, Vals); // valist. + // Use NonOpaqueTypeMap for VAArg type if available + Type *VAArgType = I.getType(); + if (NonOpaqueTypeMap && NonOpaqueTypeMap->count(&I)) { + VAArgType = (*NonOpaqueTypeMap)[&I]; + } + Vals.push_back(VE.getTypeID(VAArgType)); // restype. + break; + } + case Instruction::Freeze: { + Code = bitc::FUNC_CODE_INST_FREEZE; + pushValueAndType(I.getOperand(0), InstID, Vals); + break; + } + } + + Stream.EmitRecord(Code, Vals, AbbrevToUse); + Vals.clear(); +} + + +void NonOpaqueTypeModuleWriter::write() { + writeIdentificationBlock(Stream); + + Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3); + // We will want to write the module hash at this point. Block any flushing so + // we can have access to the whole underlying data later. + Stream.markAndBlockFlushing(); + + writeModuleVersion(); + + // Emit blockinfo, which defines the standard abbreviations etc. + writeBlockInfo(); + + // Emit information describing all of the types in the module. + NonOpaqueTypeModuleWriter::writeTypeTable(); + + // Emit information about attribute groups. + writeAttributeGroupTable(); + + // Emit information about parameter attributes. 
+ writeAttributeTable(); + + writeComdats(); + + // Emit top-level description of module, including target triple, inline asm, + // descriptors for global variables, and function prototype info. + writeModuleInfo(); + + // Emit constants. + writeModuleConstants(); + + // Emit metadata kind names. + writeModuleMetadataKinds(); + + // Emit metadata. + writeModuleMetadata(); + + // Emit module-level use-lists. + if (VE.shouldPreserveUseListOrder()) + writeUseListBlock(nullptr); + + writeOperandBundleTags(); + writeSyncScopeNames(); + + // Emit function bodies. + DenseMap FunctionToBitcodeIndex; + for (const Function &F : M) + if (!F.isDeclaration()) + writeFunction(F, FunctionToBitcodeIndex); + + // Need to write after the above call to WriteFunction which populates + // the summary information in the index. + if (Index) + writePerModuleGlobalValueSummary(); + + writeGlobalValueSymbolTable(FunctionToBitcodeIndex); + + writeModuleHash(Stream.getMarkedBufferAndResumeFlushing()); + + Stream.ExitBlock(); +} + + +void BitcodeWriter::writeBitcodeWithNonOpaqueTypes(const Module &M, + bool ShouldPreserveUseListOrder, + const ModuleSummaryIndex *Index, + bool GenerateHash, + ModuleHash *ModHash, + bool WriteNonOpaqueTypes, + DenseMap *NonOpaqueTypeMap) { + assert(!WroteStrtab); + + assert(M.isMaterialized()); + Mods.push_back(const_cast(&M)); + NonOpaqueTypeModuleWriter NonOpaqueTypeModuleWriter(M, StrtabBuilder, *Stream, + ShouldPreserveUseListOrder, *Index, + GenerateHash, *ModHash, NonOpaqueTypeMap); + NonOpaqueTypeModuleWriter.write(); +} + /// Write the specified module to the specified output stream. 
void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out, bool ShouldPreserveUseListOrder, diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp index b58c60d1db0a9..fd7df6b872fd9 100644 --- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp +++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp @@ -41,6 +41,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/UniqueBBID.h" #include "llvm/Support/WithColor.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index 1eedfc4b25912..e317e1c06741f 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -79,6 +79,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/UniqueBBID.h" #include "llvm/Target/TargetMachine.h" #include diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index fa54640265162..7baeb3fd7bcee 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include "llvm/Support/UniqueBBID.h" #include using namespace llvm; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index ef39fc74554c9..d7280eaba2440 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2593,6 +2593,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::reset_fpmode: MIRBuilder.buildResetFPMode(); return true; + case Intrinsic::get_rounding: + 
MIRBuilder.buildGetRounding(getOrCreateVReg(CI)); + return true; case Intrinsic::vscale: { MIRBuilder.buildVScale(getOrCreateVReg(CI), 1); return true; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index cdf192f9e7e3a..11b3ac82e5136 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -9272,7 +9272,7 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; APInt QNaNBitMask = APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); - APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); + APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit); auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask); @@ -9400,7 +9400,7 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign); else if (PartialCheck == fcPosNormal) { auto PosSign = MIRBuilder.buildXor( - DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask)); + DstTy, Sign, MIRBuilder.buildConstant(DstTy, InversionMask)); NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign); } appendToRes(NormalRes); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index d2c79f64afe64..b38a4d1c55af9 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -338,6 +338,17 @@ class LoopCarriedOrderDepsTracker { void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From, const LoadStoreChunk &To); + /// Add a loop-carried order dependency between \p Src and \p Dst if we + /// cannot prove they are independent. When \p PerformCheapCheck is true, a + /// lightweight dependency test (referred to as "cheap check" below) is + /// performed at first. 
Note that the cheap check is retained to maintain the + /// existing behavior and not expected to be used anymore. + /// + /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check. + void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src, + const SUnitWithMemInfo &Dst, + bool PerformCheapCheck = false); + void computeDependenciesAux(); }; @@ -673,7 +684,7 @@ void SwingSchedulerDAG::schedule() { Topo.InitDAGTopologicalSorting(); changeDependences(); postProcessDAG(); - DDG = std::make_unique(SUnits, &EntrySU, &ExitSU); + DDG = std::make_unique(SUnits, &EntrySU, &ExitSU, LCE); LLVM_DEBUG({ dump(); dbgs() << "===== Loop Carried Edges Begin =====\n"; @@ -958,11 +969,11 @@ bool SUnitWithMemInfo::getUnderlyingObjects() { /// Returns true if there is a loop-carried order dependency from \p Src to \p /// Dst. -static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, - const SUnitWithMemInfo &Dst, - BatchAAResults &BAA, - const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) { +static bool +hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, + BatchAAResults &BAA, const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI, + const SwingSchedulerDAG *SSD, bool PerformCheapCheck) { if (Src.isTriviallyDisjoint(Dst)) return false; if (isSuccOrder(Src.SU, Dst.SU)) @@ -970,24 +981,32 @@ static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, MachineInstr &SrcMI = *Src.SU->getInstr(); MachineInstr &DstMI = *Dst.SU->getInstr(); - // First, perform the cheaper check that compares the base register. - // If they are the same and the load offset is less than the store - // offset, then mark the dependence as loop carried potentially. 
- const MachineOperand *BaseOp1, *BaseOp2; - int64_t Offset1, Offset2; - bool Offset1IsScalable, Offset2IsScalable; - if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable, - TRI) && - TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable, - TRI)) { - if (BaseOp1->isIdenticalTo(*BaseOp2) && - Offset1IsScalable == Offset2IsScalable && (int)Offset1 < (int)Offset2) { - assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) && - "What happened to the chain edge?"); - return true; + if (PerformCheapCheck) { + // First, perform the cheaper check that compares the base register. + // If they are the same and the load offset is less than the store + // offset, then mark the dependence as loop carried potentially. + // + // TODO: This check will be removed. + const MachineOperand *BaseOp1, *BaseOp2; + int64_t Offset1, Offset2; + bool Offset1IsScalable, Offset2IsScalable; + if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable, + TRI) && + TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable, + TRI)) { + if (BaseOp1->isIdenticalTo(*BaseOp2) && + Offset1IsScalable == Offset2IsScalable && + (int)Offset1 < (int)Offset2) { + assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) && + "What happened to the chain edge?"); + return true; + } } } + if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI)) + return false; + // Second, the more expensive check that uses alias analysis on the // base registers. If they alias, and the load offset is less than // the store offset, the mark the dependence as loop carried. @@ -1056,20 +1075,34 @@ LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const { return std::nullopt; } +void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs( + const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, + bool PerformCheapCheck) { + // Avoid self-dependencies. 
+ if (Src.SU == Dst.SU) + return; + + if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck)) + LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum); +} + void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks( const LoadStoreChunk &From, const LoadStoreChunk &To) { - // Add dependencies for load-to-store (WAR) from top to bottom. + // Add load-to-store dependencies (WAR). for (const SUnitWithMemInfo &Src : From.Loads) for (const SUnitWithMemInfo &Dst : To.Stores) - if (Src.SU->NodeNum < Dst.SU->NodeNum && - hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI)) - LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum); + // Perform a cheap check first if this is a forward dependency. + addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum); - // TODO: The following dependencies are missed. - // - // - Dependencies for load-to-store from bottom to top. - // - Dependencies for store-to-load (RAW). - // - Dependencies for store-to-store (WAW). + // Add store-to-load dependencies (RAW). + for (const SUnitWithMemInfo &Src : From.Stores) + for (const SUnitWithMemInfo &Dst : To.Loads) + addDependenciesBetweenSUs(Src, Dst); + + // Add store-to-store dependencies (WAW). + for (const SUnitWithMemInfo &Src : From.Stores) + for (const SUnitWithMemInfo &Dst : To.Stores) + addDependenciesBetweenSUs(Src, Dst); } void LoopCarriedOrderDepsTracker::computeDependenciesAux() { @@ -1116,7 +1149,7 @@ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() { for (const int Succ : LCODTracker.getLoopCarried(I).set_bits()) LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]); - LCE.modifySUnits(SUnits); + LCE.modifySUnits(SUnits, TII); return LCE; } @@ -2676,6 +2709,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { }); } while (++NI != NE && scheduleFound); + // If a schedule is found, validate it against the validation-only + // dependencies. 
+ if (scheduleFound) + scheduleFound = DDG->isValidSchedule(Schedule); + // If a schedule is found, ensure non-pipelined instructions are in stage 0 if (scheduleFound) scheduleFound = @@ -4118,6 +4156,8 @@ SwingSchedulerDDG::getEdges(const SUnit *SU) const { void SwingSchedulerDDG::addEdge(const SUnit *SU, const SwingSchedulerDDGEdge &Edge) { + assert(!Edge.isValidationOnly() && + "Validation-only edges are not expected here."); auto &Edges = getEdges(SU); if (Edge.getSrc() == SU) Edges.Succs.push_back(Edge); @@ -4127,25 +4167,43 @@ void SwingSchedulerDDG::addEdge(const SUnit *SU, void SwingSchedulerDDG::initEdges(SUnit *SU) { for (const auto &PI : SU->Preds) { - SwingSchedulerDDGEdge Edge(SU, PI, false); + SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false, + /*IsValidationOnly=*/false); addEdge(SU, Edge); } for (const auto &SI : SU->Succs) { - SwingSchedulerDDGEdge Edge(SU, SI, true); + SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true, + /*IsValidationOnly=*/false); addEdge(SU, Edge); } } SwingSchedulerDDG::SwingSchedulerDDG(std::vector &SUnits, SUnit *EntrySU, - SUnit *ExitSU) + SUnit *ExitSU, const LoopCarriedEdges &LCE) : EntrySU(EntrySU), ExitSU(ExitSU) { EdgesVec.resize(SUnits.size()); + // Add non-loop-carried edges based on the DAG. initEdges(EntrySU); initEdges(ExitSU); for (auto &SU : SUnits) initEdges(&SU); + + // Add loop-carried edges, which are not represented in the DAG. 
+ for (SUnit &SU : SUnits) { + SUnit *Src = &SU; + if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) { + SDep Base(Src, SDep::Barrier); + Base.setLatency(1); + for (SUnit *Dst : *OD) { + SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false, + /*IsValidationOnly=*/true); + Edge.setDistance(1); + ValidationOnlyEdges.push_back(Edge); + } + } + } } const SwingSchedulerDDG::EdgesType & @@ -4158,17 +4216,73 @@ SwingSchedulerDDG::getOutEdges(const SUnit *SU) const { return getEdges(SU).Succs; } -void LoopCarriedEdges::modifySUnits(std::vector &SUnits) { - // Currently this function simply adds all dependencies represented by this - // object. After we properly handle missed dependencies, the logic here will - // be more complex, as currently missed edges should not be added to the DAG. +/// Check if \p Schedule doesn't violate the validation-only dependencies. +bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const { + unsigned II = Schedule.getInitiationInterval(); + + auto ExpandCycle = [&](SUnit *SU) { + int Stage = Schedule.stageScheduled(SU); + int Cycle = Schedule.cycleScheduled(SU); + return Cycle + (Stage * II); + }; + + for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) { + SUnit *Src = Edge.getSrc(); + SUnit *Dst = Edge.getDst(); + if (!Src->isInstr() || !Dst->isInstr()) + continue; + int CycleSrc = ExpandCycle(Src); + int CycleDst = ExpandCycle(Dst); + int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency(); + if (CycleSrc > MaxLateStart) { + LLVM_DEBUG({ + dbgs() << "Validation failed for edge from " << Src->NodeNum << " to " + << Dst->NodeNum << "\n"; + }); + return false; + } + } + return true; +} + +void LoopCarriedEdges::modifySUnits(std::vector &SUnits, + const TargetInstrInfo *TII) { for (SUnit &SU : SUnits) { SUnit *Src = &SU; if (auto *OrderDep = getOrderDepOrNull(Src)) { SDep Dep(Src, SDep::Barrier); Dep.setLatency(1); - for (SUnit *Dst : *OrderDep) - Dst->addPred(Dep); + for 
(SUnit *Dst : *OrderDep) { + SUnit *From = Src; + SUnit *To = Dst; + if (From->NodeNum > To->NodeNum) + std::swap(From, To); + + // Add a forward edge if the following conditions are met: + // + // - The instruction of the source node (FromMI) may read memory. + // - The instruction of the target node (ToMI) may modify memory, but + // does not read it. + // - Neither instruction is a global barrier. + // - The load appears before the store in the original basic block. + // - There are no barrier or store instructions between the two nodes. + // - The target node is unreachable from the source node in the current + // DAG. + // + // TODO: These conditions are inherited from a previous implementation, + // and some may no longer be necessary. For now, we conservatively + // retain all of them to avoid regressions, but the logic could + // potentially be simplified + MachineInstr *FromMI = From->getInstr(); + MachineInstr *ToMI = To->getInstr(); + if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() && + !TII->isGlobalMemoryObject(FromMI) && + !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) { + SDep Pred = Dep; + Pred.setSUnit(Src); + Dst->addPred(Pred); + } + } } } } diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 265a32cf4d127..8de2c48581a1e 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/InitializePasses.h" @@ -135,17 +136,22 @@ static CallInst::TailCallKind getOverridingTailCallKind(const Function &F) { return CallInst::TCK_None; } -static bool lowerObjCCall(Function &F, const char *NewFn, +static bool lowerObjCCall(Function &F, RTLIB::LibcallImpl NewFn, bool setNonLazyBind = false) { 
assert(IntrinsicInst::mayLowerToFunctionCall(F.getIntrinsicID()) && "Pre-ISel intrinsics do lower into regular function calls"); if (F.use_empty()) return false; + // FIXME: When RuntimeLibcalls is an analysis, check if the function is really + // supported, and go through RTLIB::Libcall. + const char *NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn); + // If we haven't already looked up this function, check to see if the // program already contains a function with this name. Module *M = F.getParent(); - FunctionCallee FCache = M->getOrInsertFunction(NewFn, F.getFunctionType()); + FunctionCallee FCache = + M->getOrInsertFunction(NewFnName, F.getFunctionType()); if (Function *Fn = dyn_cast(FCache.getCallee())) { Fn->setLinkage(F.getLinkage()); @@ -501,82 +507,83 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { }); break; case Intrinsic::objc_autorelease: - Changed |= lowerObjCCall(F, "objc_autorelease"); + Changed |= lowerObjCCall(F, RTLIB::objc_autorelease); break; case Intrinsic::objc_autoreleasePoolPop: - Changed |= lowerObjCCall(F, "objc_autoreleasePoolPop"); + Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPop); break; case Intrinsic::objc_autoreleasePoolPush: - Changed |= lowerObjCCall(F, "objc_autoreleasePoolPush"); + Changed |= lowerObjCCall(F, RTLIB::objc_autoreleasePoolPush); break; case Intrinsic::objc_autoreleaseReturnValue: - Changed |= lowerObjCCall(F, "objc_autoreleaseReturnValue"); + Changed |= lowerObjCCall(F, RTLIB::objc_autoreleaseReturnValue); break; case Intrinsic::objc_copyWeak: - Changed |= lowerObjCCall(F, "objc_copyWeak"); + Changed |= lowerObjCCall(F, RTLIB::objc_copyWeak); break; case Intrinsic::objc_destroyWeak: - Changed |= lowerObjCCall(F, "objc_destroyWeak"); + Changed |= lowerObjCCall(F, RTLIB::objc_destroyWeak); break; case Intrinsic::objc_initWeak: - Changed |= lowerObjCCall(F, "objc_initWeak"); + Changed |= lowerObjCCall(F, RTLIB::objc_initWeak); break; case Intrinsic::objc_loadWeak: - 
Changed |= lowerObjCCall(F, "objc_loadWeak"); + Changed |= lowerObjCCall(F, RTLIB::objc_loadWeak); break; case Intrinsic::objc_loadWeakRetained: - Changed |= lowerObjCCall(F, "objc_loadWeakRetained"); + Changed |= lowerObjCCall(F, RTLIB::objc_loadWeakRetained); break; case Intrinsic::objc_moveWeak: - Changed |= lowerObjCCall(F, "objc_moveWeak"); + Changed |= lowerObjCCall(F, RTLIB::objc_moveWeak); break; case Intrinsic::objc_release: - Changed |= lowerObjCCall(F, "objc_release", true); + Changed |= lowerObjCCall(F, RTLIB::objc_release, true); break; case Intrinsic::objc_retain: - Changed |= lowerObjCCall(F, "objc_retain", true); + Changed |= lowerObjCCall(F, RTLIB::objc_retain, true); break; case Intrinsic::objc_retainAutorelease: - Changed |= lowerObjCCall(F, "objc_retainAutorelease"); + Changed |= lowerObjCCall(F, RTLIB::objc_retainAutorelease); break; case Intrinsic::objc_retainAutoreleaseReturnValue: - Changed |= lowerObjCCall(F, "objc_retainAutoreleaseReturnValue"); + Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleaseReturnValue); break; case Intrinsic::objc_retainAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, "objc_retainAutoreleasedReturnValue"); + Changed |= lowerObjCCall(F, RTLIB::objc_retainAutoreleasedReturnValue); break; case Intrinsic::objc_claimAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, "objc_claimAutoreleasedReturnValue"); + Changed |= lowerObjCCall(F, RTLIB::objc_claimAutoreleasedReturnValue); break; case Intrinsic::objc_retainBlock: - Changed |= lowerObjCCall(F, "objc_retainBlock"); + Changed |= lowerObjCCall(F, RTLIB::objc_retainBlock); break; case Intrinsic::objc_storeStrong: - Changed |= lowerObjCCall(F, "objc_storeStrong"); + Changed |= lowerObjCCall(F, RTLIB::objc_storeStrong); break; case Intrinsic::objc_storeWeak: - Changed |= lowerObjCCall(F, "objc_storeWeak"); + Changed |= lowerObjCCall(F, RTLIB::objc_storeWeak); break; case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: - Changed |= lowerObjCCall(F, 
"objc_unsafeClaimAutoreleasedReturnValue"); + Changed |= + lowerObjCCall(F, RTLIB::objc_unsafeClaimAutoreleasedReturnValue); break; case Intrinsic::objc_retainedObject: - Changed |= lowerObjCCall(F, "objc_retainedObject"); + Changed |= lowerObjCCall(F, RTLIB::objc_retainedObject); break; case Intrinsic::objc_unretainedObject: - Changed |= lowerObjCCall(F, "objc_unretainedObject"); + Changed |= lowerObjCCall(F, RTLIB::objc_unretainedObject); break; case Intrinsic::objc_unretainedPointer: - Changed |= lowerObjCCall(F, "objc_unretainedPointer"); + Changed |= lowerObjCCall(F, RTLIB::objc_unretainedPointer); break; case Intrinsic::objc_retain_autorelease: - Changed |= lowerObjCCall(F, "objc_retain_autorelease"); + Changed |= lowerObjCCall(F, RTLIB::objc_retain_autorelease); break; case Intrinsic::objc_sync_enter: - Changed |= lowerObjCCall(F, "objc_sync_enter"); + Changed |= lowerObjCCall(F, RTLIB::objc_sync_enter); break; case Intrinsic::objc_sync_exit: - Changed |= lowerObjCCall(F, "objc_sync_exit"); + Changed |= lowerObjCCall(F, RTLIB::objc_sync_exit); break; case Intrinsic::exp: case Intrinsic::exp2: diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9ffdda28f7899..231184587d682 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13091,10 +13091,10 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDValue Cond, SDValue TVal, EVT CondVT = Cond.getValueType(); assert(CondVT.isVector() && "Vector select expects a vector selector!"); - bool IsTAllZero = ISD::isBuildVectorAllZeros(TVal.getNode()); - bool IsTAllOne = ISD::isBuildVectorAllOnes(TVal.getNode()); - bool IsFAllZero = ISD::isBuildVectorAllZeros(FVal.getNode()); - bool IsFAllOne = ISD::isBuildVectorAllOnes(FVal.getNode()); + bool IsTAllZero = ISD::isConstantSplatVectorAllZeros(TVal.getNode()); + bool IsTAllOne = ISD::isConstantSplatVectorAllOnes(TVal.getNode()); + bool IsFAllZero = 
ISD::isConstantSplatVectorAllZeros(FVal.getNode()); + bool IsFAllOne = ISD::isConstantSplatVectorAllOnes(FVal.getNode()); // no vselect(cond, 0/-1, X) or vselect(cond, X, 0/-1), return if (!IsTAllZero && !IsTAllOne && !IsFAllZero && !IsFAllOne) @@ -13194,8 +13194,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { return V; // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1 - if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) - return DAG.getSelect(DL, VT, F, N2, N1); + if (!TLI.isTargetCanonicalSelect(N)) + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) + return DAG.getSelect(DL, VT, F, N2, N1); // select (sext m), (add X, C), X --> (add X, (and C, (sext m)))) if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N2 && N1->hasOneUse() && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5a4cc466d2bce..58be4fb7e8331 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -10459,7 +10459,7 @@ SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) { // select true, T, F --> T // select false, T, F --> F - if (auto C = isBoolConstant(Cond, /*AllowTruncation=*/true)) + if (auto C = isBoolConstant(Cond)) return *C ? 
T : F; // select ?, T, T --> T @@ -13688,13 +13688,14 @@ bool SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const { return false; } -std::optional SelectionDAG::isBoolConstant(SDValue N, - bool AllowTruncation) const { - ConstantSDNode *Const = isConstOrConstSplat(N, false, AllowTruncation); +std::optional SelectionDAG::isBoolConstant(SDValue N) const { + ConstantSDNode *Const = + isConstOrConstSplat(N, false, /*AllowTruncation=*/true); if (!Const) return std::nullopt; - const APInt &CVal = Const->getAPIntValue(); + EVT VT = N->getValueType(0); + const APInt CVal = Const->getAPIntValue().trunc(VT.getScalarSizeInBits()); switch (TLI->getBooleanContents(N.getValueType())) { case TargetLowering::ZeroOrOneBooleanContent: if (CVal.isOne()) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 16a10cf4d0323..e0597988e8907 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9054,14 +9054,14 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; APInt QNaNBitMask = APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); - APInt InvertionMask = APInt::getAllOnes(ResultVT.getScalarSizeInBits()); + APInt InversionMask = APInt::getAllOnes(ResultVT.getScalarSizeInBits()); SDValue ValueMaskV = DAG.getConstant(ValueMask, DL, IntVT); SDValue SignBitV = DAG.getConstant(SignBit, DL, IntVT); SDValue ExpMaskV = DAG.getConstant(ExpMask, DL, IntVT); SDValue ZeroV = DAG.getConstant(0, DL, IntVT); SDValue InfV = DAG.getConstant(Inf, DL, IntVT); - SDValue ResultInvertionMask = DAG.getConstant(InvertionMask, DL, ResultVT); + SDValue ResultInversionMask = DAG.getConstant(InversionMask, DL, ResultVT); SDValue Res; const auto appendResult = [&](SDValue PartialRes) { @@ -9205,7 +9205,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, 
SDValue Op, PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV); else if (PartialCheck == fcPosNormal) { SDValue PosSignV = - DAG.getNode(ISD::XOR, DL, ResultVT, SignV, ResultInvertionMask); + DAG.getNode(ISD::XOR, DL, ResultVT, SignV, ResultInversionMask); PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, PosSignV); } if (IsF80) @@ -9217,7 +9217,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, if (!Res) return DAG.getConstant(IsInverted, DL, ResultVT); if (IsInverted) - Res = DAG.getNode(ISD::XOR, DL, ResultVT, Res, ResultInvertionMask); + Res = DAG.getNode(ISD::XOR, DL, ResultVT, Res, ResultInversionMask); return Res; } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 660a1a4d7ec47..518a9339d8d11 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -214,6 +214,24 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, Reg1.isPhysical() ? MI.getOperand(Idx1).isRenamable() : false; bool Reg2IsRenamable = Reg2.isPhysical() ? MI.getOperand(Idx2).isRenamable() : false; + + // For a case like this: + // %0.sub = INST %0.sub(tied), %1.sub, implicit-def %0 + // we need to update the implicit-def after commuting to result in: + // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 + SmallVector UpdateImplicitDefIdx; + if (HasDef && MI.hasImplicitDef()) { + const TargetRegisterInfo *TRI = + MI.getMF()->getSubtarget().getRegisterInfo(); + for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) { + Register ImplReg = MO.getReg(); + if ((ImplReg.isVirtual() && ImplReg == Reg0) || + (ImplReg.isPhysical() && Reg0.isPhysical() && + TRI->isSubRegisterEq(ImplReg, Reg0))) + UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands()); + } + } + // If destination is tied to either of the commuted source register, then // it must be updated. 
if (HasDef && Reg0 == Reg1 && @@ -238,15 +256,10 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, } if (HasDef) { - // Use `substituteRegister` so that for a case like this: - // %0.sub = INST %0.sub(tied), %1.sub, implicit-def %0 - // the implicit-def is also updated, to result in: - // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 - const TargetRegisterInfo &TRI = - *MI.getMF()->getSubtarget().getRegisterInfo(); - Register FromReg = CommutedMI->getOperand(0).getReg(); - CommutedMI->substituteRegister(FromReg, Reg0, /*SubRegIdx=*/0, TRI); + CommutedMI->getOperand(0).setReg(Reg0); CommutedMI->getOperand(0).setSubReg(SubReg0); + for (unsigned Idx : UpdateImplicitDefIdx) + CommutedMI->getOperand(Idx).setReg(Reg0); } CommutedMI->getOperand(Idx2).setReg(Reg1); CommutedMI->getOperand(Idx1).setReg(Reg2); diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp index 1e325d76bd515..12d31f809f882 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp @@ -239,15 +239,14 @@ Error makeUnexpectedOpcodeError(const LinkGraph &G, const ThumbRelocation &R, Edge::Kind Kind) { return make_error( formatv("Invalid opcode [ {0:x4}, {1:x4} ] for relocation: {2}", - static_cast(R.Hi), static_cast(R.Lo), - G.getEdgeKindName(Kind))); + R.Hi.value(), R.Lo.value(), G.getEdgeKindName(Kind))); } Error makeUnexpectedOpcodeError(const LinkGraph &G, const ArmRelocation &R, Edge::Kind Kind) { return make_error( - formatv("Invalid opcode {0:x8} for relocation: {1}", - static_cast(R.Wd), G.getEdgeKindName(Kind))); + formatv("Invalid opcode {0:x8} for relocation: {1}", R.Wd.value(), + G.getEdgeKindName(Kind))); } template constexpr bool isArm() { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index db792a3b52d24..170224616ac64 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2617,7 +2617,7 @@ void OpenMPIRBuilder::emitReductionListCopy( Expected OpenMPIRBuilder::emitInterWarpCopyFunction( const LocationDescription &Loc, ArrayRef ReductionInfos, AttributeList FuncAttrs) { - InsertPointTy SavedIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()}, @@ -2630,6 +2630,7 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( WcFunc->addParamAttr(1, Attribute::NoUndef); BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc); Builder.SetInsertPoint(EntryBB); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // ReduceList: thread local Reduce list. // At the stage of the computation when this function is called, partially @@ -2844,7 +2845,6 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( } Builder.CreateRetVoid(); - Builder.restoreIP(SavedIP); return WcFunc; } @@ -2853,6 +2853,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, AttributeList FuncAttrs) { LLVMContext &Ctx = M.getContext(); + IRBuilder<>::InsertPointGuard IPG(Builder); FunctionType *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt16Ty(), @@ -2871,6 +2872,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( SarFunc->addParamAttr(3, Attribute::SExt); BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc); Builder.SetInsertPoint(EntryBB); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Thread local Reduce list used to host the values of data to be reduced. 
Argument *ReduceListArg = SarFunc->getArg(0); @@ -3017,7 +3019,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( ArrayRef ReductionInfos, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3033,6 +3035,7 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGCFunc->getArg(0); @@ -3120,14 +3123,13 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( } Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGCFunc; } Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3143,6 +3145,7 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. 
Argument *BufferArg = LtGRFunc->getArg(0); @@ -3203,14 +3206,13 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList}) ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGRFunc; } Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( ArrayRef ReductionInfos, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3226,6 +3228,7 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGCFunc->getArg(0); @@ -3311,14 +3314,13 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( } Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGCFunc; } Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); auto *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3334,6 +3336,7 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. 
Argument *BufferArg = LtGRFunc->getArg(0); @@ -3394,7 +3397,6 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( Builder.CreateCall(ReduceFn, {ReduceList, ReductionList}) ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGRFunc; } @@ -3407,6 +3409,7 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { Expected OpenMPIRBuilder::createReductionFunction( StringRef ReducerName, ArrayRef ReductionInfos, ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { + IRBuilder<>::InsertPointGuard IPG(Builder); auto *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, /* IsVarArg */ false); @@ -3419,6 +3422,7 @@ Expected OpenMPIRBuilder::createReductionFunction( BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", ReductionFunc); Builder.SetInsertPoint(EntryBB); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Need to alloca memory here and deal with the pointers before getting // LHS/RHS pointers out @@ -3746,10 +3750,12 @@ static Error populateReductionFunction( Function *ReductionFunc, ArrayRef ReductionInfos, IRBuilder<> &Builder, ArrayRef IsByRef, bool IsGPU) { + IRBuilder<>::InsertPointGuard IPG(Builder); Module *Module = ReductionFunc->getParent(); BasicBlock *ReductionFuncBlock = BasicBlock::Create(Module->getContext(), "", ReductionFunc); Builder.SetInsertPoint(ReductionFuncBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Value *LHSArrayPtr = nullptr; Value *RHSArrayPtr = nullptr; if (IsGPU) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 4e09f847627af..84a56058de834 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1325,6 +1325,63 @@ return wrap(unwrap(Builder)->createEnumerationType( LineNumber, SizeInBits, AlignInBits, Elts, unwrapDI(ClassTy))); } +LLVMMetadataRef LLVMDIBuilderCreateSetType( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char 
*Name, + size_t NameLen, LLVMMetadataRef File, unsigned LineNumber, + uint64_t SizeInBits, uint32_t AlignInBits, LLVMMetadataRef BaseTy) { + return wrap(unwrap(Builder)->createSetType( + unwrapDI(Scope), {Name, NameLen}, unwrapDI(File), + LineNumber, SizeInBits, AlignInBits, unwrapDI(BaseTy))); +} + +LLVMMetadataRef LLVMDIBuilderCreateSubrangeType( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, unsigned LineNo, LLVMMetadataRef File, uint64_t SizeInBits, + uint32_t AlignInBits, LLVMDIFlags Flags, LLVMMetadataRef BaseTy, + LLVMMetadataRef LowerBound, LLVMMetadataRef UpperBound, + LLVMMetadataRef Stride, LLVMMetadataRef Bias) { + return wrap(unwrap(Builder)->createSubrangeType( + {Name, NameLen}, unwrapDI(File), LineNo, unwrapDI(Scope), + SizeInBits, AlignInBits, map_from_llvmDIFlags(Flags), + unwrapDI(BaseTy), unwrap(LowerBound), unwrap(UpperBound), + unwrap(Stride), unwrap(Bias))); +} + +/// MD may be nullptr, a DIExpression or DIVariable. +PointerUnion unwrapExprVar(LLVMMetadataRef MD) { + if (!MD) + return nullptr; + MDNode *MDN = unwrapDI(MD); + if (auto *E = dyn_cast(MDN)) + return E; + assert(isa(MDN) && "Expected DIExpression or DIVariable"); + return cast(MDN); +} + +LLVMMetadataRef LLVMDIBuilderCreateDynamicArrayType( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, unsigned LineNo, LLVMMetadataRef File, uint64_t Size, + uint32_t AlignInBits, LLVMMetadataRef Ty, LLVMMetadataRef *Subscripts, + unsigned NumSubscripts, LLVMMetadataRef DataLocation, + LLVMMetadataRef Associated, LLVMMetadataRef Allocated, LLVMMetadataRef Rank, + LLVMMetadataRef BitStride) { + auto Subs = + unwrap(Builder)->getOrCreateArray({unwrap(Subscripts), NumSubscripts}); + return wrap(unwrap(Builder)->createArrayType( + unwrapDI(Scope), {Name, NameLen}, unwrapDI(File), LineNo, + Size, AlignInBits, unwrapDI(Ty), Subs, + unwrapExprVar(DataLocation), unwrapExprVar(Associated), + unwrapExprVar(Allocated), 
unwrapExprVar(Rank), unwrap(BitStride))); +} + +void LLVMReplaceArrays(LLVMDIBuilderRef Builder, LLVMMetadataRef *T, + LLVMMetadataRef *Elements, unsigned NumElements) { + auto CT = unwrap(*T); + auto Elts = + unwrap(Builder)->getOrCreateArray({unwrap(Elements), NumElements}); + unwrap(Builder)->replaceArrays(CT, Elts); +} + LLVMMetadataRef LLVMDIBuilderCreateUnionType( LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, size_t NameLen, LLVMMetadataRef File, unsigned LineNumber, diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index 96829bd062a78..dc758785e40d5 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -24,10 +24,10 @@ constexpr size_t StringRef::npos; // strncasecmp() is not available on non-POSIX systems, so define an // alternative function here. -static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { - for (size_t I = 0; I < Length; ++I) { - unsigned char LHC = toLower(LHS[I]); - unsigned char RHC = toLower(RHS[I]); +static int ascii_strncasecmp(StringRef LHS, StringRef RHS) { + for (auto [LC, RC] : zip_equal(LHS, RHS)) { + unsigned char LHC = toLower(LC); + unsigned char RHC = toLower(RC); if (LHC != RHC) return LHC < RHC ? 
-1 : 1; } @@ -35,8 +35,8 @@ static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { } int StringRef::compare_insensitive(StringRef RHS) const { - if (int Res = - ascii_strncasecmp(data(), RHS.data(), std::min(size(), RHS.size()))) + size_t Min = std::min(size(), RHS.size()); + if (int Res = ascii_strncasecmp(take_front(Min), RHS.take_front(Min))) return Res; if (size() == RHS.size()) return 0; @@ -45,13 +45,12 @@ int StringRef::compare_insensitive(StringRef RHS) const { bool StringRef::starts_with_insensitive(StringRef Prefix) const { return size() >= Prefix.size() && - ascii_strncasecmp(data(), Prefix.data(), Prefix.size()) == 0; + ascii_strncasecmp(take_front(Prefix.size()), Prefix) == 0; } bool StringRef::ends_with_insensitive(StringRef Suffix) const { return size() >= Suffix.size() && - ascii_strncasecmp(end() - Suffix.size(), Suffix.data(), - Suffix.size()) == 0; + ascii_strncasecmp(take_back(Suffix.size()), Suffix) == 0; } size_t StringRef::find_insensitive(char C, size_t From) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 331c8036e26f1..f7de61f044a7d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1352,6 +1352,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); } } + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); + for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); @@ -16046,9 +16049,19 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // Scalarize v2f16 to turn it into a faddp. 
This will be more efficient than + // widening by inserting zeroes. + if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD && + SrcVT == MVT::v2f16) { + SDLoc DL(Op); + return DAG.getNode(ISD::FADD, DL, MVT::f16, + DAG.getExtractVectorElt(DL, MVT::f16, Src, 0), + DAG.getExtractVectorElt(DL, MVT::f16, Src, 1)); + } // Try to lower fixed length reductions to SVE. - EVT SrcVT = Src.getValueType(); bool OverrideNEON = !Subtarget->isNeonAvailable() || Op.getOpcode() == ISD::VECREDUCE_AND || Op.getOpcode() == ISD::VECREDUCE_OR || @@ -17834,17 +17847,19 @@ bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( const MachineFunction &MF, EVT VT) const { - VT = VT.getScalarType(); + EVT ScalarVT = VT.getScalarType(); - if (!VT.isSimple()) + if (!ScalarVT.isSimple()) return false; - switch (VT.getSimpleVT().SimpleTy) { + switch (ScalarVT.getSimpleVT().SimpleTy) { case MVT::f16: return Subtarget->hasFullFP16(); case MVT::f32: case MVT::f64: return true; + case MVT::bf16: + return VT.isScalableVector() && Subtarget->hasSVEB16B16(); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8847c62690714..c1474773faa76 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -7868,62 +7868,48 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case AArch64MachineCombinerPattern::MULADDWI_OP1: - case AArch64MachineCombinerPattern::MULADDXI_OP1: { + case AArch64MachineCombinerPattern::MULADDXI_OP1: + case AArch64MachineCombinerPattern::MULSUBWI_OP1: + case AArch64MachineCombinerPattern::MULSUBXI_OP1: { // MUL I=A,B,0 - // ADD R,I,Imm - // ==> MOV V, Imm + // ADD/SUB R,I,Imm + // ==> MOV V, Imm/-Imm // ==> MADD R,A,B,V // --- Create(MADD); - const TargetRegisterClass *OrrRC; - unsigned BitSize, 
OrrOpc, ZeroReg; - if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) { - OrrOpc = AArch64::ORRWri; - OrrRC = &AArch64::GPR32spRegClass; + const TargetRegisterClass *RC; + unsigned BitSize, MovImm; + if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 || + Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) { + MovImm = AArch64::MOVi32imm; + RC = &AArch64::GPR32spRegClass; BitSize = 32; - ZeroReg = AArch64::WZR; Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { - OrrOpc = AArch64::ORRXri; - OrrRC = &AArch64::GPR64spRegClass; + MovImm = AArch64::MOVi64imm; + RC = &AArch64::GPR64spRegClass; BitSize = 64; - ZeroReg = AArch64::XZR; Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - Register NewVR = MRI.createVirtualRegister(OrrRC); + Register NewVR = MRI.createVirtualRegister(RC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { unsigned Val = Root.getOperand(3).getImm(); Imm = Imm << Val; } - uint64_t UImm = SignExtend64(Imm, BitSize); - // The immediate can be composed via a single instruction. + bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 || + Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1; + uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize); + // Check that the immediate can be composed via a single instruction. SmallVector Insn; AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); if (Insn.size() != 1) return; - auto MovI = Insn.begin(); - MachineInstrBuilder MIB1; - // MOV is an alias for one of three instructions: movz, movn, and orr. 
- if (MovI->Opcode == OrrOpc) - MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) - .addReg(ZeroReg) - .addImm(MovI->Op2); - else { - if (BitSize == 32) - assert((MovI->Opcode == AArch64::MOVNWi || - MovI->Opcode == AArch64::MOVZWi) && - "Expected opcode"); - else - assert((MovI->Opcode == AArch64::MOVNXi || - MovI->Opcode == AArch64::MOVZXi) && - "Expected opcode"); - MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) - .addImm(MovI->Op1) - .addImm(MovI->Op2); - } + MachineInstrBuilder MIB1 = + BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR) + .addImm(IsSub ? -Imm : Imm); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); @@ -7977,67 +7963,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case AArch64MachineCombinerPattern::MULSUBWI_OP1: - case AArch64MachineCombinerPattern::MULSUBXI_OP1: { - // MUL I=A,B,0 - // SUB R,I, Imm - // ==> MOV V, -Imm - // ==> MADD R,A,B,V // = -Imm + A*B - // --- Create(MADD); - const TargetRegisterClass *OrrRC; - unsigned BitSize, OrrOpc, ZeroReg; - if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) { - OrrOpc = AArch64::ORRWri; - OrrRC = &AArch64::GPR32spRegClass; - BitSize = 32; - ZeroReg = AArch64::WZR; - Opc = AArch64::MADDWrrr; - RC = &AArch64::GPR32RegClass; - } else { - OrrOpc = AArch64::ORRXri; - OrrRC = &AArch64::GPR64spRegClass; - BitSize = 64; - ZeroReg = AArch64::XZR; - Opc = AArch64::MADDXrrr; - RC = &AArch64::GPR64RegClass; - } - Register NewVR = MRI.createVirtualRegister(OrrRC); - uint64_t Imm = Root.getOperand(2).getImm(); - if (Root.getOperand(3).isImm()) { - unsigned Val = Root.getOperand(3).getImm(); - Imm = Imm << Val; - } - uint64_t UImm = SignExtend64(-Imm, BitSize); - // The immediate can be composed via a single instruction. 
- SmallVector Insn; - AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); - if (Insn.size() != 1) - return; - auto MovI = Insn.begin(); - MachineInstrBuilder MIB1; - // MOV is an alias for one of three instructions: movz, movn, and orr. - if (MovI->Opcode == OrrOpc) - MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) - .addReg(ZeroReg) - .addImm(MovI->Op2); - else { - if (BitSize == 32) - assert((MovI->Opcode == AArch64::MOVNWi || - MovI->Opcode == AArch64::MOVZWi) && - "Expected opcode"); - else - assert((MovI->Opcode == AArch64::MOVNXi || - MovI->Opcode == AArch64::MOVZXi) && - "Expected opcode"); - MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) - .addImm(MovI->Op1) - .addImm(MovI->Op2); - } - InsInstrs.push_back(MIB1); - InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); - MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); - break; - } case AArch64MachineCombinerPattern::MULADDv8i8_OP1: Opc = AArch64::MLAv8i8; RC = &AArch64::FPR64RegClass; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index dde4e7ab0e890..e6b22695761e7 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -2529,31 +2529,63 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( return E; } - for (unsigned Count = 0; MBBI != E && Count < Limit; - MBBI = next_nodbg(MBBI, E)) { - MachineInstr &MI = *MBBI; - - // Don't count transient instructions towards the search limit since there - // may be different numbers of them if e.g. debug information is present. - if (!MI.isTransient()) - ++Count; - - // If we found a match, return it. 
- if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) - return MBBI; + unsigned Count = 0; + MachineBasicBlock *CurMBB = I->getParent(); + // choice of next block to visit is liveins-based + bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness(); + + while (true) { + for (MachineBasicBlock::iterator CurEnd = CurMBB->end(); + MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(MBBI, CurEnd)) { + MachineInstr &MI = *MBBI; + + // Don't count transient instructions towards the search limit since there + // may be different numbers of them if e.g. debug information is present. + if (!MI.isTransient()) + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, + TRI); + + // Otherwise, if the base register is used or modified, we have no match, + // so return early. If we are optimizing SP, do not allow instructions + // that may load or store in between the load and the optimized value + // update. + if (!ModifiedRegUnits.available(BaseReg) || + !UsedRegUnits.available(BaseReg) || + (BaseRegSP && MBBI->mayLoadOrStore())) + return E; + } - // Update the status of what the instruction clobbered and used. - LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); + if (!VisitSucc || Limit <= Count) + break; - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - // If we are optimizing SP, do not allow instructions that may load or store - // in between the load and the optimized value update. 
- if (!ModifiedRegUnits.available(BaseReg) || - !UsedRegUnits.available(BaseReg) || - (BaseRegSP && MBBI->mayLoadOrStore())) - return E; + // Try to go downward to successors along a CF path w/o side enters + // such that BaseReg is alive along it but not at its exits + MachineBasicBlock *SuccToVisit = nullptr; + unsigned LiveSuccCount = 0; + for (MachineBasicBlock *Succ : CurMBB->successors()) { + for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) { + if (Succ->isLiveIn(*AI)) { + if (LiveSuccCount++) + return E; + if (Succ->pred_size() == 1) + SuccToVisit = Succ; + break; + } + } + } + if (!SuccToVisit) + break; + CurMBB = SuccToVisit; + MBBI = CurMBB->begin(); } + return E; } diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3b7e5a6c2b1cf..a0320f919e8c5 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2490,6 +2490,8 @@ multiclass sve_fp_3op_p_zds_a_bfloat opc, string asm, string Ps, SVEPseudo2Instr, SVEInstr2Rev; def : SVE_4_Op_Pat(NAME)>; + def : SVE_4_Op_Pat(NAME)>; + def : SVE_4_Op_Pat(NAME)>; } class sve_fp_3op_p_zds_b sz, bits<2> opc, string asm, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 91ace4d2b7f16..31420caca0899 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2488,6 +2488,10 @@ def HasFmaakFmamkF32Insts : Predicate<"Subtarget->hasFmaakFmamkF32Insts()">, AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>; +def HasFmaakFmamkF64Insts : + Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, AssemblerPredicate<(all_of FeatureImageInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 92fa66074b26e..6439230b8769f 100644 --- 
a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -336,6 +336,20 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f32); } + bool isRegOrInlineImmWithFP64InputMods() const { + return isRegOrInline(AMDGPU::VS_64RegClassID, MVT::f64); + } + + bool isVRegWithInputMods(unsigned RCID) const { return isRegClass(RCID); } + + bool isVRegWithFP32InputMods() const { + return isVRegWithInputMods(AMDGPU::VGPR_32RegClassID); + } + + bool isVRegWithFP64InputMods() const { + return isVRegWithInputMods(AMDGPU::VReg_64RegClassID); + } + bool isPackedFP16InputMods() const { return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::v2f16); } @@ -531,7 +545,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } - bool isVCSrcB64() const { + bool isVCSrc_b64() const { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); } @@ -557,7 +571,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } - bool isVCSrcF64() const { + bool isVCSrc_f64() const { return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } @@ -605,7 +619,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isVCSrc_f32() || isLiteralImm(MVT::i32) || isExpr(); } - bool isVSrc_b64() const { return isVCSrcF64() || isLiteralImm(MVT::i64); } + bool isVSrc_b64() const { return isVCSrc_f64() || isLiteralImm(MVT::i64); } bool isVSrcT_b16() const { return isVCSrcT_b16() || isLiteralImm(MVT::i16); } @@ -621,15 +635,11 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isVSrc_v2b16() const { return isVSrc_b16() || isLiteralImm(MVT::v2i16); } - bool isVCSrcV2FP32() const { - return isVCSrcF64(); - } + bool isVCSrcV2FP32() const { return isVCSrc_f64(); } bool isVSrc_v2f32() const { return isVSrc_f64() || isLiteralImm(MVT::v2f32); } - bool 
isVCSrcV2INT32() const { - return isVCSrcB64(); - } + bool isVCSrc_v2b32() const { return isVCSrc_b64(); } bool isVSrc_v2b32() const { return isVSrc_b64() || isLiteralImm(MVT::v2i32); } @@ -637,7 +647,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isVCSrc_f32() || isLiteralImm(MVT::f32) || isExpr(); } - bool isVSrc_f64() const { return isVCSrcF64() || isLiteralImm(MVT::f64); } + bool isVSrc_f64() const { return isVCSrc_f64() || isLiteralImm(MVT::f64); } bool isVSrcT_bf16() const { return isVCSrcTBF16() || isLiteralImm(MVT::bf16); } @@ -941,6 +951,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isLiteralImm(MVT::f16); } + bool isKImmFP64() const { return isLiteralImm(MVT::f64); } + bool isMem() const override { return false; } @@ -1531,6 +1543,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); } + bool isGFX1250() const { return AMDGPU::isGFX1250(getSTI()); } + bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); } bool isGFX10_BEncoding() const { @@ -1782,8 +1796,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands); bool validateSOPLiteral(const MCInst &Inst) const; bool validateConstantBusLimitations(const MCInst &Inst, const OperandVector &Operands); - bool validateVOPDRegBankConstraints(const MCInst &Inst, - const OperandVector &Operands); + std::optional checkVOPDRegBankConstraints(const MCInst &Inst, + bool AsVOPD3); + bool validateVOPD(const MCInst &Inst, const OperandVector &Operands); + bool tryVOPD(const MCInst &Inst); + bool tryVOPD3(const MCInst &Inst); + bool tryAnotherVOPDEncoding(const MCInst &Inst); + bool validateIntClampSupported(const MCInst &Inst); bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); @@ -1986,6 +2005,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case 
AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + case AMDGPU::OPERAND_KIMM64: return &APFloat::IEEEdouble(); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: @@ -2326,6 +2346,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // in predicate methods (isLiteralImm()) llvm_unreachable("fp literal in 64-bit integer instruction."); + case AMDGPU::OPERAND_KIMM64: + Inst.addOperand(MCOperand::createImm(Val)); + setImmKindMandatoryLiteral(); + return; + case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_INLINE_C_BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: @@ -2531,6 +2556,13 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo Inst.addOperand(MCOperand::createImm(Literal.getLoBits(16).getZExtValue())); setImmKindMandatoryLiteral(); return; + case AMDGPU::OPERAND_KIMM64: + if ((isInt<32>(Val) || isUInt<32>(Val)) && !getModifiers().Lit64) + Val <<= 32; + + Inst.addOperand(MCOperand::createImm(Val)); + setImmKindMandatoryLiteral(); + return; default: llvm_unreachable("invalid operand size"); } @@ -3569,6 +3601,13 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { } } + // Asm can first try to match VOPD or VOPD3. By failing early here with + // Match_InvalidOperand, the parser will retry parsing as VOPD3 or VOPD. + // Checking later during validateInstruction does not give a chance to retry + // parsing as a different encoding. 
+ if (tryAnotherVOPDEncoding(Inst)) + return Match_InvalidOperand; + return Match_Success; } @@ -3749,8 +3788,10 @@ static OperandIndices getSrcOperandIndices(unsigned Opcode, return {getNamedOperandIdx(Opcode, OpName::src0X), getNamedOperandIdx(Opcode, OpName::vsrc1X), + getNamedOperandIdx(Opcode, OpName::vsrc2X), getNamedOperandIdx(Opcode, OpName::src0Y), getNamedOperandIdx(Opcode, OpName::vsrc1Y), + getNamedOperandIdx(Opcode, OpName::vsrc2Y), ImmXIdx, ImmIdx}; } @@ -3880,12 +3921,12 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( return false; } -bool AMDGPUAsmParser::validateVOPDRegBankConstraints( - const MCInst &Inst, const OperandVector &Operands) { +std::optional +AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) { const unsigned Opcode = Inst.getOpcode(); if (!isVOPD(Opcode)) - return true; + return {}; const MCRegisterInfo *TRI = getContext().getRegisterInfo(); @@ -3896,16 +3937,64 @@ bool AMDGPUAsmParser::validateVOPDRegBankConstraints( : MCRegister(); }; - // On GFX12 if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 source-cache. - bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12; + // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 + // source-cache. + bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 || + Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 || + Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250; + bool AllowSameVGPR = isGFX1250(); + + if (AsVOPD3) { // Literal constants are not allowed with VOPD3. 
+ for (auto OpName : {OpName::src0X, OpName::src0Y}) { + int I = getNamedOperandIdx(Opcode, OpName); + const MCOperand &Op = Inst.getOperand(I); + if (!Op.isImm()) + continue; + int64_t Imm = Op.getImm(); + if (!AMDGPU::isInlinableLiteral32(Imm, hasInv2PiInlineImm()) && + !AMDGPU::isInlinableLiteral64(Imm, hasInv2PiInlineImm())) + return (unsigned)I; + } + + for (auto OpName : {OpName::vsrc1X, OpName::vsrc1Y, OpName::vsrc2X, + OpName::vsrc2Y, OpName::imm}) { + int I = getNamedOperandIdx(Opcode, OpName); + if (I == -1) + continue; + const MCOperand &Op = Inst.getOperand(I); + if (Op.isImm()) + return (unsigned)I; + } + } const auto &InstInfo = getVOPDInstInfo(Opcode, &MII); - auto InvalidCompOprIdx = - InstInfo.getInvalidCompOperandIndex(getVRegIdx, SkipSrc); - if (!InvalidCompOprIdx) + auto InvalidCompOprIdx = InstInfo.getInvalidCompOperandIndex( + getVRegIdx, *TRI, SkipSrc, AllowSameVGPR, AsVOPD3); + + return InvalidCompOprIdx; +} + +bool AMDGPUAsmParser::validateVOPD(const MCInst &Inst, + const OperandVector &Operands) { + + unsigned Opcode = Inst.getOpcode(); + bool AsVOPD3 = MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3; + + if (AsVOPD3) { + for (unsigned I = 0, E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if ((Op.isRegKind() || Op.isImmTy(AMDGPUOperand::ImmTyNone)) && + (Op.getModifiers().getFPModifiersOperand() & SISrcMods::ABS)) + Error(Op.getStartLoc(), "ABS not allowed in VOPD3 instructions"); + } + } + + auto InvalidCompOprIdx = checkVOPDRegBankConstraints(Inst, AsVOPD3); + if (!InvalidCompOprIdx.has_value()) return true; auto CompOprIdx = *InvalidCompOprIdx; + const auto &InstInfo = getVOPDInstInfo(Opcode, &MII); auto ParsedIdx = std::max(InstInfo[VOPD::X].getIndexInParsedOperands(CompOprIdx), InstInfo[VOPD::Y].getIndexInParsedOperands(CompOprIdx)); @@ -3913,7 +4002,10 @@ bool AMDGPUAsmParser::validateVOPDRegBankConstraints( auto Loc = ((AMDGPUOperand &)*Operands[ParsedIdx]).getStartLoc(); if (CompOprIdx == 
VOPD::Component::DST) { - Error(Loc, "one dst register must be even and the other odd"); + if (AsVOPD3) + Error(Loc, "dst registers must be distinct"); + else + Error(Loc, "one dst register must be even and the other odd"); } else { auto CompSrcIdx = CompOprIdx - VOPD::Component::DST_NUM; Error(Loc, Twine("src") + Twine(CompSrcIdx) + @@ -3923,6 +4015,75 @@ bool AMDGPUAsmParser::validateVOPDRegBankConstraints( return false; } +// \returns true if \p Inst does not satisfy VOPD constraints, but can be +// potentially used as VOPD3 with the same operands. +bool AMDGPUAsmParser::tryVOPD3(const MCInst &Inst) { + // First check if it fits VOPD + auto InvalidCompOprIdx = checkVOPDRegBankConstraints(Inst, false); + if (!InvalidCompOprIdx.has_value()) + return false; + + // Then if it fits VOPD3 + InvalidCompOprIdx = checkVOPDRegBankConstraints(Inst, true); + if (InvalidCompOprIdx.has_value()) { + // If failed operand is dst it is better to show error about VOPD3 + // instruction as it has more capabilities and error message will be + // more informative. If the dst is not legal for VOPD3, then it is not + // legal for VOPD either. + if (*InvalidCompOprIdx == VOPD::Component::DST) + return true; + + // Otherwise prefer VOPD as we may find ourselves in an awkward situation + // with a conflict in tied implicit src2 of fmac and no asm operand + // to point to. + return false; + } + return true; +} + +// \returns true if a VOPD3 instruction can also be represented as a shorter +// VOPD encoding.
+bool AMDGPUAsmParser::tryVOPD(const MCInst &Inst) { + const unsigned Opcode = Inst.getOpcode(); + const auto &II = getVOPDInstInfo(Opcode, &MII); + unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(getSTI()); + if (!getCanBeVOPD(II[VOPD::X].getOpcode(), EncodingFamily, false).X || + !getCanBeVOPD(II[VOPD::Y].getOpcode(), EncodingFamily, false).Y) + return false; + + // This is an awkward exception, VOPD3 variant of V_DUAL_CNDMASK_B32 has + // explicit src2 even if it is vcc_lo. If it was parsed as VOPD3 it cannot + // be parsed as VOPD which does not accept src2. + if (II[VOPD::X].getOpcode() == AMDGPU::V_CNDMASK_B32_e32 || + II[VOPD::Y].getOpcode() == AMDGPU::V_CNDMASK_B32_e32) + return false; + + // If any modifiers are set this cannot be VOPD. + for (auto OpName : {OpName::src0X_modifiers, OpName::src0Y_modifiers, + OpName::vsrc1X_modifiers, OpName::vsrc1Y_modifiers, + OpName::vsrc2X_modifiers, OpName::vsrc2Y_modifiers}) { + int I = getNamedOperandIdx(Opcode, OpName); + if (I == -1) + continue; + if (Inst.getOperand(I).getImm()) + return false; + } + + return !tryVOPD3(Inst); +} + +// VOPD3 has more relaxed register constraints than VOPD. We prefer shorter VOPD +// form but switch to VOPD3 otherwise. 
+bool AMDGPUAsmParser::tryAnotherVOPDEncoding(const MCInst &Inst) { + const unsigned Opcode = Inst.getOpcode(); + if (!isGFX1250() || !isVOPD(Opcode)) + return false; + + if (MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3) + return tryVOPD(Inst); + return tryVOPD3(Inst); +} + bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); @@ -4846,7 +5007,7 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, unsigned NumExprs = 0; unsigned NumLiterals = 0; - uint32_t LiteralValue; + uint64_t LiteralValue; for (int OpIdx : OpIndices) { if (OpIdx == -1) @@ -4860,16 +5021,21 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { uint64_t Value = static_cast(MO.getImm()); - bool IsFP64 = AMDGPU::isSISrcFPOperand(Desc, OpIdx) && + bool IsForcedFP64 = + Desc.operands()[OpIdx].OperandType == AMDGPU::OPERAND_KIMM64 || + (Desc.operands()[OpIdx].OperandType == AMDGPU::OPERAND_REG_IMM_FP64 && + HasMandatoryLiteral); + bool IsFP64 = (IsForcedFP64 || AMDGPU::isSISrcFPOperand(Desc, OpIdx)) && AMDGPU::getOperandSize(Desc.operands()[OpIdx]) == 8; bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP64); - if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) { + if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value) && + !IsForcedFP64 && (!has64BitLiterals() || Desc.getSize() != 4)) { Error(getLitLoc(Operands), "invalid operand for instruction"); return false; } - if (IsFP64 && IsValid32Op) + if (IsFP64 && IsValid32Op && !IsForcedFP64) Value = Hi_32(Value); if (NumLiterals == 0 || LiteralValue != Value) { @@ -5243,7 +5409,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateConstantBusLimitations(Inst, Operands)) { return false; } - if (!validateVOPDRegBankConstraints(Inst, Operands)) { + if (!validateVOPD(Inst, Operands)) { return false; } if (!validateIntClampSupported(Inst)) { @@ -9244,8 +9410,14 @@ ParseStatus 
AMDGPUAsmParser::parseVOPD(OperandVector &Operands) { // Create VOPD MCInst operands using parsed assembler operands. void AMDGPUAsmParser::cvtVOPD(MCInst &Inst, const OperandVector &Operands) { + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + auto addOp = [&](uint16_t ParsedOprIdx) { // NOLINT:function pointer AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[ParsedOprIdx]); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + return; + } if (Op.isReg()) { Op.addRegOperands(Inst, 1); return; @@ -9274,6 +9446,17 @@ void AMDGPUAsmParser::cvtVOPD(MCInst &Inst, const OperandVector &Operands) { if (CInfo.hasSrc2Acc()) addOp(CInfo.getIndexOfDstInParsedOperands()); } + + int BitOp3Idx = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::bitop3); + if (BitOp3Idx != -1) { + OptionalImmIndexMap OptIdx; + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands.back()); + if (Op.isImm()) + OptIdx[Op.getImmTy()] = Operands.size() - 1; + + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyBitOp3); + } } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 7b1ea11d58168..98f7e17e9528c 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -353,6 +353,13 @@ static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm, return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); } +static DecodeStatus decodeOperand_KImmFP64(MCInst &Inst, uint64_t Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { + const auto *DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeMandatoryLiteral64Constant(Imm)); +} + static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val, uint64_t Addr, const void *Decoder) { const 
auto *DAsm = static_cast(Decoder); @@ -613,6 +620,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) break; + if (STI.hasFeature(AMDGPU::Feature64BitLiterals)) { + // Return 8 bytes for a potential literal. + Bytes = Bytes_.slice(4, MaxInstBytesNum - 4); + + if (isGFX1250() && + tryDecodeInst(DecoderTableGFX125096, MI, DecW, Address, CS)) + break; + } + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); @@ -1467,6 +1483,17 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { return MCOperand::createImm(Literal); } +MCOperand +AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { + if (HasLiteral) { + if (Literal64 != Val) + return errOperand(Val, "More than one unique literal is illegal"); + } + HasLiteral = true; + Literal = Literal64 = Val; + return MCOperand::createImm(Literal64); +} + MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 8927f208fd2af..84041001b6ba7 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -178,6 +178,7 @@ class AMDGPUDisassembler : public MCDisassembler { static MCOperand decodeIntImmed(unsigned Imm); MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const; + MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const; MCOperand decodeLiteralConstant(bool ExtendFP64) const; MCOperand decodeLiteral64Constant() const; diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index ccc711a0bcc4e..27f40f1705bb4 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ 
b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -42,11 +42,13 @@ class GCNCreateVOPD { class VOPDCombineInfo { public: VOPDCombineInfo() = default; - VOPDCombineInfo(MachineInstr *First, MachineInstr *Second) - : FirstMI(First), SecondMI(Second) {} + VOPDCombineInfo(MachineInstr *First, MachineInstr *Second, + bool VOPD3 = false) + : FirstMI(First), SecondMI(Second), IsVOPD3(VOPD3) {} MachineInstr *FirstMI; MachineInstr *SecondMI; + bool IsVOPD3; }; public: @@ -59,9 +61,9 @@ class GCNCreateVOPD { unsigned Opc2 = SecondMI->getOpcode(); unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(SII->getSubtarget()); - int NewOpcode = - AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), - AMDGPU::getVOPDOpcode(Opc2), EncodingFamily); + int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1, CI.IsVOPD3), + AMDGPU::getVOPDOpcode(Opc2, CI.IsVOPD3), + EncodingFamily, CI.IsVOPD3); assert(NewOpcode != -1 && "Should have previously determined this as a possible VOPD\n"); @@ -79,12 +81,36 @@ class GCNCreateVOPD { VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx)); } + const AMDGPU::OpName Mods[2][3] = { + {AMDGPU::OpName::src0X_modifiers, AMDGPU::OpName::vsrc1X_modifiers, + AMDGPU::OpName::vsrc2X_modifiers}, + {AMDGPU::OpName::src0Y_modifiers, AMDGPU::OpName::vsrc1Y_modifiers, + AMDGPU::OpName::vsrc2Y_modifiers}}; + const AMDGPU::OpName SrcMods[3] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + const unsigned VOPDOpc = VOPDInst->getOpcode(); + for (auto CompIdx : VOPD::COMPONENTS) { auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum(); + bool IsVOP3 = SII->isVOP3(*MI[CompIdx]); for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum; ++CompSrcIdx) { - auto MCOprIdx = InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx); + if (AMDGPU::hasNamedOperand(VOPDOpc, Mods[CompIdx][CompSrcIdx])) { + const MachineOperand *Mod = + SII->getNamedOperand(*MI[CompIdx], SrcMods[CompSrcIdx]); + VOPDInst.addImm(Mod ? 
Mod->getImm() : 0); + } + auto MCOprIdx = + InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx, IsVOP3); VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx)); } + if (MI[CompIdx]->getOpcode() == AMDGPU::V_CNDMASK_B32_e32 && CI.IsVOPD3) + VOPDInst.addReg(AMDGPU::VCC_LO); + } + + if (CI.IsVOPD3) { + if (unsigned BitOp2 = AMDGPU::getBitOp2(Opc2)) + VOPDInst.addImm(BitOp2); } SII->fixImplicitOperands(*VOPDInst); @@ -109,6 +135,8 @@ class GCNCreateVOPD { const SIInstrInfo *SII = ST->getInstrInfo(); bool Changed = false; + unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(*ST); + bool HasVOPD3 = ST->hasVOPD3(); SmallVector ReplaceCandidates; @@ -124,19 +152,27 @@ class GCNCreateVOPD { auto *SecondMI = &*MII; unsigned Opc = FirstMI->getOpcode(); unsigned Opc2 = SecondMI->getOpcode(); - llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); - llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); VOPDCombineInfo CI; - if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) - CI = VOPDCombineInfo(FirstMI, SecondMI); - else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) - CI = VOPDCombineInfo(SecondMI, FirstMI); - else - continue; - // checkVOPDRegConstraints cares about program order, but doReplace - // cares about X-Y order in the constituted VOPD - if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { + const auto checkVOPD = [&](bool VOPD3) -> bool { + llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = + AMDGPU::getCanBeVOPD(Opc, EncodingFamily, VOPD3); + llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = + AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3); + + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + CI = VOPDCombineInfo(FirstMI, SecondMI, VOPD3); + else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + CI = VOPDCombineInfo(SecondMI, FirstMI, VOPD3); + else + return false; + // checkVOPDRegConstraints cares about program order, but doReplace + // cares about X-Y order in the constituted VOPD + return llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, + 
VOPD3); + }; + + if (checkVOPD(false) || (HasVOPD3 && checkVOPD(true))) { ReplaceCandidates.push_back(CI); ++MII; } diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index b5ffa64c3a4b4..46b7c2f50780d 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -9,11 +9,11 @@ // The code produced for "generic" is only useful for tests and cannot // reasonably be expected to execute on any particular target. def : ProcessorModel<"generic", NoSchedModel, - [FeatureGDS, FeatureGWS] + [] >; def : ProcessorModel<"generic-hsa", NoSchedModel, - [FeatureGDS, FeatureGWS, FeatureFlatAddressSpace] + [FeatureFlatAddressSpace] >; //===------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index acdd369f17925..e6dd98a104209 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1100,6 +1100,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return getGeneration() >= GFX10 || hasGFX940Insts(); } + bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); } + bool hasImageInsts() const { return HasImageInsts; } @@ -1482,6 +1484,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasGFX1250Insts() const { return GFX1250Insts; } + bool hasVOPD3() const { return GFX1250Insts; } + // \returns true if target has S_SETPRIO_INC_WG instruction. 
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 33c208495c500..9e66909e41052 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -36,11 +36,19 @@ using namespace llvm; bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, const MachineInstr &FirstMI, - const MachineInstr &SecondMI) { + const MachineInstr &SecondMI, bool IsVOPD3) { namespace VOPD = AMDGPU::VOPD; const MachineFunction *MF = FirstMI.getMF(); const GCNSubtarget &ST = MF->getSubtarget(); + + if (IsVOPD3 && !ST.hasVOPD3()) + return false; + if (!IsVOPD3 && (TII.isVOP3(FirstMI) || TII.isVOP3(SecondMI))) + return false; + if (TII.isDPP(FirstMI) || TII.isDPP(SecondMI)) + return false; + const SIRegisterInfo *TRI = dyn_cast(ST.getRegisterInfo()); const MachineRegisterInfo &MRI = MF->getRegInfo(); // Literals also count against scalar bus limit @@ -80,23 +88,61 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, for (auto CompIdx : VOPD::COMPONENTS) { const MachineInstr &MI = (CompIdx == VOPD::X) ? 
FirstMI : SecondMI; - const MachineOperand &Src0 = MI.getOperand(VOPD::Component::SRC0); + const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0); if (Src0.isReg()) { if (!TRI->isVectorRegister(MRI, Src0.getReg())) { if (!is_contained(UniqueScalarRegs, Src0.getReg())) UniqueScalarRegs.push_back(Src0.getReg()); } - } else { - if (!TII.isInlineConstant(MI, VOPD::Component::SRC0)) - addLiteral(Src0); + } else if (!TII.isInlineConstant(Src0)) { + if (IsVOPD3) + return false; + addLiteral(Src0); } if (InstInfo[CompIdx].hasMandatoryLiteral()) { + if (IsVOPD3) + return false; + auto CompOprIdx = InstInfo[CompIdx].getMandatoryLiteralCompOperandIndex(); addLiteral(MI.getOperand(CompOprIdx)); } if (MI.getDesc().hasImplicitUseOfPhysReg(AMDGPU::VCC)) UniqueScalarRegs.push_back(AMDGPU::VCC_LO); + + if (IsVOPD3) { + for (auto OpName : {AMDGPU::OpName::src1, AMDGPU::OpName::src2}) { + const MachineOperand *Src = TII.getNamedOperand(MI, OpName); + if (!Src) + continue; + if (OpName == AMDGPU::OpName::src2) { + if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::bitop3)) + continue; + if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) { + UniqueScalarRegs.push_back(Src->getReg()); + continue; + } + } + if (!Src->isReg() || !TRI->isVGPR(MRI, Src->getReg())) + return false; + } + + for (auto OpName : {AMDGPU::OpName::clamp, AMDGPU::OpName::omod, + AMDGPU::OpName::op_sel}) { + if (TII.hasModifiersSet(MI, OpName)) + return false; + } + + // Neg is allowed, other modifiers are not. NB: even though sext has the + // same value as neg, there are no combinable instructions with sext. 
+ for (auto OpName : + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + const MachineOperand *Mods = TII.getNamedOperand(MI, OpName); + if (Mods && (Mods->getImm() & ~SISrcMods::NEG)) + return false; + } + } } if (UniqueLiterals.size() > 1) @@ -104,14 +150,33 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) return false; - // On GFX12 if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 source-cache. + // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 + // source-cache. bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 && FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 && SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32; + bool AllowSameVGPR = ST.hasGFX1250Insts(); - if (InstInfo.hasInvalidOperand(getVRegIdx, SkipSrc)) + if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR, + IsVOPD3)) return false; + if (IsVOPD3) { + // BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero. 
+ if (AMDGPU::hasNamedOperand(SecondMI.getOpcode(), AMDGPU::OpName::bitop3)) { + const MachineOperand &Src2 = + *TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2); + if (!Src2.isImm() || Src2.getImm()) + return false; + } + if (AMDGPU::hasNamedOperand(FirstMI.getOpcode(), AMDGPU::OpName::bitop3)) { + const MachineOperand &Src2 = + *TII.getNamedOperand(FirstMI, AMDGPU::OpName::src2); + if (!Src2.isImm() || Src2.getImm()) + return false; + } + } + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI << "\n\tY: " << SecondMI << "\n"); return true; @@ -125,21 +190,28 @@ static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, const MachineInstr *FirstMI, const MachineInstr &SecondMI) { const SIInstrInfo &STII = static_cast(TII); + const GCNSubtarget &ST = STII.getSubtarget(); + unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(ST); unsigned Opc2 = SecondMI.getOpcode(); - auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); - // One instruction case - if (!FirstMI) - return SecondCanBeVOPD.Y; + const auto checkVOPD = [&](bool VOPD3) -> bool { + auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3); - unsigned Opc = FirstMI->getOpcode(); - auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + // One instruction case + if (!FirstMI) + return SecondCanBeVOPD.Y || SecondCanBeVOPD.X; - if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || - (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) - return false; + unsigned Opc = FirstMI->getOpcode(); + auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc, EncodingFamily, VOPD3); + + if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || + (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) + return false; + + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3); + }; - return checkVOPDRegConstraints(STII, *FirstMI, SecondMI); + return checkVOPD(false) || (ST.hasVOPD3() && checkVOPD(true)); } namespace { diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h index 
22361b9a1a078..f776ae95e79c4 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h @@ -23,7 +23,7 @@ class SIInstrInfo; bool checkVOPDRegConstraints(const SIInstrInfo &TII, const MachineInstr &FirstMI, - const MachineInstr &SecondMI); + const MachineInstr &SecondMI, bool IsVOPD3); std::unique_ptr createVOPDPairingMutation(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 8ce12dfeda779..cb6319ed627ca 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -76,6 +76,18 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); } +void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // KIMM64 + // This part needs to align with AMDGPUInstPrinter::printImmediate64. 
+ uint64_t Imm = MI->getOperand(OpNo).getImm(); + if (STI.hasFeature(AMDGPU::Feature64BitLiterals) && Lo_32(Imm)) + O << "lit64(" << formatHex(static_cast(Imm)) << ')'; + else + O << formatHex(static_cast(Hi_32(Imm))); +} + void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef BitName) { if (MI->getOperand(OpNo).getImm()) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 071e0a9d0fee6..fb803b1f81342 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -42,6 +42,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printFP64ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef BitName); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f0f655e93f4cc..4bb3942936f04 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -87,9 +87,10 @@ class AMDGPUMCCodeEmitter : public MCCodeEmitter { const MCSubtargetInfo &STI) const; /// Encode an fp or int literal. 
- std::optional getLitEncoding(const MCOperand &MO, - const MCOperandInfo &OpInfo, - const MCSubtargetInfo &STI) const; + std::optional + getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo, + const MCSubtargetInfo &STI, + bool HasMandatoryLiteral = false) const; void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl &Fixups, APInt &Inst, APInt &Scratch, @@ -265,10 +266,9 @@ static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI, : 255; } -std::optional -AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, - const MCOperandInfo &OpInfo, - const MCSubtargetInfo &STI) const { +std::optional AMDGPUMCCodeEmitter::getLitEncoding( + const MCOperand &MO, const MCOperandInfo &OpInfo, + const MCSubtargetInfo &STI, bool HasMandatoryLiteral) const { int64_t Imm; if (MO.isExpr()) { if (!MO.getExpr()->evaluateAsAbsolute(Imm)) @@ -303,9 +303,13 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - case AMDGPU::OPERAND_REG_IMM_FP64: return getLit64Encoding(static_cast(Imm), STI, true); + case AMDGPU::OPERAND_REG_IMM_FP64: { + auto Enc = getLit64Encoding(static_cast(Imm), STI, true); + return (HasMandatoryLiteral && Enc == 255) ? 
254 : Enc; + } + case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: return getLit16IntEncoding(static_cast(Imm), STI); @@ -339,6 +343,7 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: + case AMDGPU::OPERAND_KIMM64: return MO.getImm(); default: llvm_unreachable("invalid operand size"); @@ -685,7 +690,10 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (AMDGPU::isSISrcOperand(Desc, OpNo)) { - if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI)) { + bool HasMandatoryLiteral = + AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm); + if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI, + HasMandatoryLiteral)) { Op = *Enc; return; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index bd7359a656716..a8649970aa825 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -98,6 +98,8 @@ enum : uint64_t { // VINTERP instruction format. VINTERP = 1 << 29, + VOPD3 = 1 << 30, + // High bits - other information. VM_CNT = UINT64_C(1) << 32, EXP_CNT = UINT64_C(1) << 33, @@ -227,6 +229,7 @@ enum OperandType : unsigned { /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16, + OPERAND_KIMM64, /// Operands with an AccVGPR register or inline constant OPERAND_REG_INLINE_AC_INT32, @@ -252,7 +255,7 @@ enum OperandType : unsigned { OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, OPERAND_KIMM_FIRST = OPERAND_KIMM32, - OPERAND_KIMM_LAST = OPERAND_KIMM16 + OPERAND_KIMM_LAST = OPERAND_KIMM64 }; } @@ -260,16 +263,16 @@ enum OperandType : unsigned { // Input operand modifiers bit-masks // NEG and SEXT share same bit-mask because they can't be set simultaneously. 
namespace SISrcMods { - enum : unsigned { - NONE = 0, - NEG = 1 << 0, // Floating-point negate modifier - ABS = 1 << 1, // Floating-point absolute modifier - SEXT = 1 << 0, // Integer sign-extend modifier - NEG_HI = ABS, // Floating-point negate high packed component modifier. - OP_SEL_0 = 1 << 2, - OP_SEL_1 = 1 << 3, - DST_OP_SEL = 1 << 3 // VOP3 dst op_sel (share mask with OP_SEL_1) - }; +enum : unsigned { + NONE = 0, + NEG = 1 << 0, // Floating-point negate modifier + ABS = 1 << 1, // Floating-point absolute modifier + SEXT = 1 << 4, // Integer sign-extend modifier + NEG_HI = ABS, // Floating-point negate high packed component modifier. + OP_SEL_0 = 1 << 2, + OP_SEL_1 = 1 << 3, + DST_OP_SEL = 1 << 3 // VOP3 dst op_sel (share mask with OP_SEL_1) +}; } namespace SIOutMods { diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index c27d4e0df6fc5..a368bc5d0b1a1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -55,6 +55,8 @@ class InstSI ; // constant bus. def KImmFP16 : KImmFPOperand; +// 64-bit VALU immediate operand that uses the constant bus. 
+def KImmFP64 : KImmFPOperand { + let DecoderMethod = "decodeOperand_KImmFP64"; + let PrintMethod = "printFP64ImmOperand"; +} + class FPInputModsMatchClass : AsmOperandClass { let Name = "RegOrImmWithFP"#opSize#"InputMods"; let ParserMethod = "parseRegOrImmWithFPInputMods"; @@ -1327,6 +1333,11 @@ class FPVCSrcInputModsMatchClass : FPInputModsMatchClass { let PredicateMethod = "isRegOrInlineImmWithFP"#opSize#"InputMods"; } +class FPVRegSrcInputModsMatchClass : FPInputModsMatchClass { + let Name = "VRegWithFP"#opSize#"InputMods"; + let PredicateMethod = "isVRegWithFP"#opSize#"InputMods"; +} + def FP16InputModsMatchClass : FPInputModsMatchClass<16>; class FPT16InputModsMatchClass : FPInputModsMatchClass<16> { let Name = !if(IsFake16, "RegOrImmWithFPFake16InputMods", @@ -1345,6 +1356,10 @@ class FP16VCSrcInputModsMatchClass !if(IsFake16, "true", "false") # ">"; } def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>; +def FP64VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<64>; + +def FP32VRegSrcInputModsMatchClass : FPVRegSrcInputModsMatchClass<32>; +def FP64VRegSrcInputModsMatchClass : FPVRegSrcInputModsMatchClass<64>; class InputMods : Operand { let OperandNamespace = "AMDGPU"; @@ -1371,6 +1386,10 @@ class FPT16VCSrcInputMods let EncoderMethod = "getMachineOpValueT16"; } def FP32VCSrcInputMods : FPInputMods; +def FP64VCSrcInputMods : FPInputMods; + +def FP32VRegSrcInputMods : FPInputMods; +def FP64VRegSrcInputMods : FPInputMods; class IntInputModsMatchClass : AsmOperandClass { let Name = "RegOrImmWithInt"#opSize#"InputMods"; @@ -1782,6 +1801,32 @@ class getVOPSrc0ForVT { 1 : VSrc_b32); } +// Returns the register class to use for source VGPR, SGPR or inline constant +// for the given VT. 
+class getVCSrcForVT { + RegisterOperand ret = + !if(VT.isFP, + !if(!eq(VT.Size, 64), + VCSrc_f64, + !cond(!eq(VT.Value, f16.Value) : VCSrc_f16, + !eq(VT.Value, bf16.Value) : VCSrc_bf16, + !eq(VT.Value, v2f16.Value) : VCSrc_v2f16, + !eq(VT.Value, v2bf16.Value) : VCSrc_v2bf16, + 1 : VCSrc_f32) + ), + !if(!eq(VT.Size, 64), + VCSrc_b64, + !if(!eq(VT.Value, i16.Value), + VCSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VCSrc_b32 + ) + ) + ) + ); +} + class getSOPSrcForVT { RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32); } @@ -1922,6 +1967,20 @@ class getSrcModDPP_t16 { IntT16_Lo128VRegInputMods, IntVRegInputMods)); } +// Return type of input modifiers operand for specified input operand for DPP +// or VOPD3. +class getSrcModVOP3VC { + Operand ret = + !if (VT.isFP, + !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + FPT16VCSrcInputMods, + !if (!eq(VT.Value, f64.Value), FP64VCSrcInputMods, + FP32VCSrcInputMods)), + !if (!eq(VT.Value, i16.Value), + IntT16VCSrcInputMods, + Int32VCSrcInputMods)); +} + // Return type of input modifiers operand for specified input operand for DPP // True16: If the destination is a 16-bit value, the src0 modifier must hold // dst's opsel bit. Use a dummy value for DstVT if getting the mod for a src operand besides 0. @@ -1943,16 +2002,12 @@ class getSrc0ModVOP3DPP { Operand ret = !if(!and(!not(IsFake16), !eq(DstVT.Size, 16)), T16Dst, Normal); } -// GFX11 only supports VGPR src1, but the restriction is done in AsmParser -// and GCNDPPCombine. -class getSrcModVOP3DPP { +// Return type of input modifiers operand for specified input operand for VGPR +// only operands (VOPD3 vsrc1 and vsrc2). 
+class getSrcModVOP3V { Operand ret = - !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), - FPT16VCSrcInputMods, FP32VCSrcInputMods), - !if (!eq(VT.Value, i16.Value), - IntT16VCSrcInputMods, - Int32VCSrcInputMods)); + !if (!eq(VT.Value, f64.Value), FP64VRegSrcInputMods, + FP32VRegSrcInputMods); } // Return type of input modifiers operand specified input operand for SDWA @@ -2185,6 +2240,27 @@ class getInsSDWA { + dag Src0 = !if(HasModifiers, + !if(IsCompY, (ins Src0Mod:$src0Y_modifiers, Src0VOPD3:$src0Y), + (ins Src0Mod:$src0X_modifiers, Src0VOPD3:$src0X)), + !if(IsCompY, (ins Src0VOPD3:$src0Y), (ins Src0VOPD3:$src0X))); + dag Src1 = !if(HasModifiers, + !if(IsCompY, (ins Src1Mod:$vsrc1Y_modifiers, Src1VOPD3:$vsrc1Y), + (ins Src1Mod:$vsrc1X_modifiers, Src1VOPD3:$vsrc1X)), + !if(IsCompY, (ins Src1VOPD3:$vsrc1Y), (ins Src1VOPD3:$vsrc1X))); + dag Src2 = !if(HasModifiers, + !if(IsCompY, (ins Src2Mod:$vsrc2Y_modifiers, Src2VOPD3:$vsrc2Y), + (ins Src2Mod:$vsrc2X_modifiers, Src2VOPD3:$vsrc2X)), + !if(IsCompY, (ins Src2VOPD3:$vsrc2Y), (ins Src2VOPD3:$vsrc2X))); + dag ret = !con(Src0, + !if(HasSrc1, Src1, (ins)), + !if(HasSrc2, Src2, (ins))); +} + // Outs for DPP class getOutsDPP { dag ret = !if(HasDst, @@ -2216,13 +2292,16 @@ class getAsm32 { !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } -class getAsmVOPDPart { +class getAsmVOPDPart { + string mods = !if(HasModifiers, "_modifiers", ""); string dst = "$vdst" # XorY; - string src0 = ", $src0" # XorY; - string src1 = ", $vsrc1" # XorY; + string src0 = ", $src0" # XorY # mods; + string src1 = ", $vsrc1" # XorY # mods; + string src2 = ", $vsrc2" # XorY # mods; string ret = dst # !if(!ge(NumSrcArgs, 1), src0, "") # - !if(!ge(NumSrcArgs, 2), src1, ""); + !if(!ge(NumSrcArgs, 2), src1, "") # + !if(HasVOPD3Src2, src2, ""); } // Returns the assembly string for the inputs and outputs of a VOP3P @@ -2515,10 +2594,16 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field Operand Src1ModDPP = 
getSrcModDPP.ret; field Operand Src2ModDPP = getSrcModDPP.ret; field Operand Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - field Operand Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - field Operand Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + field Operand Src1ModVOP3DPP = getSrcModVOP3VC.ret; + field Operand Src2ModVOP3DPP = getSrcModVOP3VC.ret; field Operand Src0ModSDWA = getSrcModSDWA.ret; field Operand Src1ModSDWA = getSrcModSDWA.ret; + field RegisterOperand Src0VOPD3 = getVCSrcForVT.ret; + field RegisterOperand Src1VOPD3 = getVregSrcForVT.ret; + field RegisterOperand Src2VOPD3 = getVregSrcForVT.ret; + field Operand Src0ModVOPD3 = getSrcModVOP3VC.ret; + field Operand Src1ModVOPD3 = getSrcModVOP3V.ret; + field Operand Src2ModVOPD3 = getSrcModVOP3V.ret; field bit IsMAI = 0; @@ -2642,6 +2727,13 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { // component is FMAAK or FMAMK field dag InsVOPDX_immX = (ins !if(!eq(Src0VT.Size, 32), VSrc_f32, VSrc_f16):$src0X, VGPR_32:$vsrc1X); field dag InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y); + field bit HasVOPD3Src2 = HasSrc2; + field dag InsVOPD3X = getInsVOPD3.ret; + field dag InsVOPD3Y = getInsVOPD3.ret; field string Asm32 = getAsm32.ret; field string AsmDPP = !if(HasExtDPP, @@ -2662,6 +2754,8 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field string AsmSDWA9 = getAsmSDWA9.ret; field string AsmVOPDX = getAsmVOPDPart.ret; field string AsmVOPDY = getAsmVOPDPart.ret; + field string AsmVOPD3X = getAsmVOPDPart.ret; + field string AsmVOPD3Y = getAsmVOPDPart.ret; field string TieRegDPP = "$old"; field bit IsSMFMAC = false; field bit HasAbid = !and(IsMAI, HasSrc1); @@ -2705,8 +2799,8 @@ class VOPProfile_True16 : VOPProfile { let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; let 
DstRC64 = getVALUDstForVT.ret; let Src0RC64 = getVOP3SrcForVT.ret; @@ -2735,8 +2829,8 @@ class VOPProfile_Fake16 : VOPProfile { let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>; @@ -3224,7 +3318,7 @@ def FP4FP8DstByteSelTable : GenericTable { def VOPDComponentTable : GenericTable { let FilterClass = "VOPD_Component"; let CppTypeName = "VOPDComponentInfo"; - let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"]; + let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX", "CanBeVOPD3X"]; let PrimaryKey = ["BaseVOP"]; let PrimaryKeyName = "getVOPDComponentHelper"; } @@ -3237,14 +3331,14 @@ def getVOPDBaseFromComponent : SearchIndex { def VOPDPairs : GenericTable { let FilterClass = "VOPD_Base"; let CppTypeName = "VOPDInfo"; - let Fields = ["Opcode", "OpX", "OpY", "SubTgt"]; + let Fields = ["Opcode", "OpX", "OpY", "SubTgt", "VOPD3"]; let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getVOPDOpcodeHelper"; } def getVOPDInfoFromComponentOpcodes : SearchIndex { let Table = VOPDPairs; - let Key = ["OpX", "OpY", "SubTgt"]; + let Key = ["OpX", "OpY", "SubTgt", "VOPD3"]; } include "SIInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index a6c7b164c8b2c..d24c301fc1e51 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1267,6 +1267,8 @@ def VCSrc_bf16 : SrcRegOrImm9 ; def VCSrc_f16 : SrcRegOrImm9 ; def VCSrc_b32 : SrcRegOrImm9 ; def VCSrc_f32 : SrcRegOrImm9 ; +def VCSrc_b64 : SrcRegOrImm9 ; +def VCSrc_f64 : SrcRegOrImm9 ; def VCSrc_v2b16 : SrcRegOrImm9 ; def VCSrc_v2bf16: SrcRegOrImm9 ; def VCSrc_v2f16 : SrcRegOrImm9 ; diff --git 
a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index c7c4276e0e252..2472b76fcf02c 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1764,6 +1764,27 @@ let OtherPredicates = [HasExportInsts] in [(int_amdgcn_s_wait_kmcnt timm:$simm16)]>; } // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 +let SubtargetPredicate = isGFX1250Plus, hasSideEffects = 1 in { + def S_WAIT_ASYNCCNT : + SOPP_Pseudo<"s_wait_asynccnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_asynccnt timm:$simm16)]> { + let mayLoad = 1; + let mayStore = 1; + let maybeAtomic = 0; + let Uses = [ASYNCcnt]; + let Defs = [ASYNCcnt]; + } + def S_WAIT_TENSORCNT : + SOPP_Pseudo<"s_wait_tensorcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_tensorcnt timm:$simm16)]> { + let mayLoad = 1; + let mayStore = 1; + let maybeAtomic = 0; + let Uses = [TENSORcnt]; + let Defs = [TENSORcnt]; + } +} // End SubtargetPredicate = isGFX1250Plus, hasSideEffects = 1 + let SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1 in { def S_WAIT_XCNT : SOPP_Pseudo<"s_wait_xcnt", (ins s16imm:$simm16), "$simm16">; @@ -2609,6 +2630,8 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>; //===----------------------------------------------------------------------===// defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>; defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>; +defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>; +defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12<0x04b>; //===----------------------------------------------------------------------===// // SOPP - GFX11, GFX12. 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 10d80756943f5..a32078cc403e7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -392,6 +392,7 @@ struct VOPDComponentInfo { uint16_t BaseVOP; uint16_t VOPDOp; bool CanBeVOPDX; + bool CanBeVOPD3X; }; struct VOPDInfo { @@ -399,6 +400,7 @@ struct VOPDInfo { uint16_t OpX; uint16_t OpY; uint16_t Subtarget; + bool VOPD3; }; struct VOPTrue16Info { @@ -591,6 +593,8 @@ const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ, } unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) { + if (ST.hasFeature(AMDGPU::FeatureGFX1250Insts)) + return SIEncodingFamily::GFX1250; if (ST.hasFeature(AMDGPU::FeatureGFX12Insts)) return SIEncodingFamily::GFX12; if (ST.hasFeature(AMDGPU::FeatureGFX11Insts)) @@ -598,14 +602,27 @@ unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) { llvm_unreachable("Subtarget generation does not support VOPD!"); } -CanBeVOPD getCanBeVOPD(unsigned Opc) { +CanBeVOPD getCanBeVOPD(unsigned Opc, unsigned EncodingFamily, bool VOPD3) { + bool IsConvertibleToBitOp = VOPD3 ? getBitOp2(Opc) : 0; + Opc = IsConvertibleToBitOp ? (unsigned)AMDGPU::V_BITOP3_B32_e64 : Opc; const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); - if (Info) - return {Info->CanBeVOPDX, true}; + if (Info) { + // Check that Opc can be used as VOPDY for this encoding. V_MOV_B32 as a + // VOPDX is just a placeholder here, it is supported on all encodings. + // TODO: This can be optimized by creating tables of supported VOPDY + // opcodes per encoding. + unsigned VOPDMov = AMDGPU::getVOPDOpcode(AMDGPU::V_MOV_B32_e32, VOPD3); + bool CanBeVOPDY = getVOPDFull(VOPDMov, AMDGPU::getVOPDOpcode(Opc, VOPD3), + EncodingFamily, VOPD3) != -1; + return {VOPD3 ? 
Info->CanBeVOPD3X : Info->CanBeVOPDX, CanBeVOPDY}; + } + return {false, false}; } -unsigned getVOPDOpcode(unsigned Opc) { +unsigned getVOPDOpcode(unsigned Opc, bool VOPD3) { + bool IsConvertibleToBitOp = VOPD3 ? getBitOp2(Opc) : 0; + Opc = IsConvertibleToBitOp ? (unsigned)AMDGPU::V_BITOP3_B32_e64 : Opc; const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); return Info ? Info->VOPDOp : ~0u; } @@ -742,9 +759,27 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast(Gen)); } -int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily) { +unsigned getBitOp2(unsigned Opc) { + switch (Opc) { + default: + return 0; + case AMDGPU::V_AND_B32_e32: + return 0x40; + case AMDGPU::V_OR_B32_e32: + return 0x54; + case AMDGPU::V_XOR_B32_e32: + return 0x14; + case AMDGPU::V_XNOR_B32_e32: + return 0x41; + } +} + +int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily, + bool VOPD3) { + bool IsConvertibleToBitOp = VOPD3 ? getBitOp2(OpY) : 0; + OpY = IsConvertibleToBitOp ? (unsigned)AMDGPU::V_BITOP3_B32_e64 : OpY; const VOPDInfo *Info = - getVOPDInfoFromComponentOpcodes(OpX, OpY, EncodingFamily); + getVOPDInfoFromComponentOpcodes(OpX, OpY, EncodingFamily, VOPD3); return Info ? 
Info->Opcode : -1; } @@ -759,7 +794,7 @@ std::pair getVOPDComponents(unsigned VOPDOpcode) { namespace VOPD { -ComponentProps::ComponentProps(const MCInstrDesc &OpDesc) { +ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) { assert(OpDesc.getNumDefs() == Component::DST_NUM); assert(OpDesc.getOperandConstraint(Component::SRC0, MCOI::TIED_TO) == -1); @@ -767,10 +802,34 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc) { auto TiedIdx = OpDesc.getOperandConstraint(Component::SRC2, MCOI::TIED_TO); assert(TiedIdx == -1 || TiedIdx == Component::DST); HasSrc2Acc = TiedIdx != -1; + Opcode = OpDesc.getOpcode(); - SrcOperandsNum = OpDesc.getNumOperands() - OpDesc.getNumDefs(); + IsVOP3 = VOP3Layout || (OpDesc.TSFlags & SIInstrFlags::VOP3); + SrcOperandsNum = AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2) ? 3 + : AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::imm) ? 3 + : AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1) ? 2 + : 1; assert(SrcOperandsNum <= Component::MAX_SRC_NUM); + if (Opcode == AMDGPU::V_CNDMASK_B32_e32 || + Opcode == AMDGPU::V_CNDMASK_B32_e64) { + // CNDMASK is an awkward exception, it has FP modifiers, but not FP + // operands. + NumVOPD3Mods = 2; + if (IsVOP3) + SrcOperandsNum = 3; + } else if (isSISrcFPOperand(OpDesc, + getNamedOperandIdx(Opcode, OpName::src0))) { + // All FP VOPD instructions have Neg modifiers for all operands except + // for tied src2. 
+ NumVOPD3Mods = SrcOperandsNum; + if (HasSrc2Acc) + --NumVOPD3Mods; + } + + if (OpDesc.TSFlags & SIInstrFlags::VOP3) + return; + auto OperandsNum = OpDesc.getNumOperands(); unsigned CompOprIdx; for (CompOprIdx = Component::SRC1; CompOprIdx < OperandsNum; ++CompOprIdx) { @@ -781,6 +840,10 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc) { } } +int ComponentProps::getBitOp3OperandIdx() const { + return getNamedOperandIdx(Opcode, OpName::bitop3); +} + unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { assert(CompOprIdx < Component::MAX_OPR_NUM); @@ -796,19 +859,58 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional InstInfo::getInvalidCompOperandIndex( - std::function GetRegIdx, bool SkipSrc) const { + std::function GetRegIdx, + const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, + bool VOPD3) const { + + auto OpXRegs = getRegIndices(ComponentIndex::X, GetRegIdx, + CompInfo[ComponentIndex::X].isVOP3()); + auto OpYRegs = getRegIndices(ComponentIndex::Y, GetRegIdx, + CompInfo[ComponentIndex::Y].isVOP3()); + + const auto banksOverlap = [&MRI](MCRegister X, MCRegister Y, + unsigned BanksMask) -> bool { + MCRegister BaseX = MRI.getSubReg(X, AMDGPU::sub0); + MCRegister BaseY = MRI.getSubReg(Y, AMDGPU::sub0); + if (!BaseX) + BaseX = X; + if (!BaseY) + BaseY = Y; + if ((BaseX & BanksMask) == (BaseY & BanksMask)) + return true; + if (BaseX != X /* This is 64-bit register */ && + ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + return true; + if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + return true; - auto OpXRegs = getRegIndices(ComponentIndex::X, GetRegIdx); - auto OpYRegs = getRegIndices(ComponentIndex::Y, GetRegIdx); + // If both are 64-bit bank conflict will be detected yet while checking + // the first subreg. + return false; + }; - const unsigned CompOprNum = - SkipSrc ? 
Component::DST_NUM : Component::MAX_OPR_NUM; unsigned CompOprIdx; - for (CompOprIdx = 0; CompOprIdx < CompOprNum; ++CompOprIdx) { - unsigned BanksMasks = VOPD_VGPR_BANK_MASKS[CompOprIdx]; - if (OpXRegs[CompOprIdx] && OpYRegs[CompOprIdx] && - ((OpXRegs[CompOprIdx] & BanksMasks) == - (OpYRegs[CompOprIdx] & BanksMasks))) + for (CompOprIdx = 0; CompOprIdx < Component::MAX_OPR_NUM; ++CompOprIdx) { + unsigned BanksMasks = VOPD3 ? VOPD3_VGPR_BANK_MASKS[CompOprIdx] + : VOPD_VGPR_BANK_MASKS[CompOprIdx]; + if (!OpXRegs[CompOprIdx] || !OpYRegs[CompOprIdx]) + continue; + + if (SkipSrc && CompOprIdx >= Component::DST_NUM) + continue; + + if (CompOprIdx < Component::DST_NUM) { + // Even if we do not check vdst parity, vdst operands still shall not + // overlap. + if (MRI.regsOverlap(OpXRegs[CompOprIdx], OpYRegs[CompOprIdx])) + return CompOprIdx; + if (VOPD3) // No need to check dst parity. + continue; + } + + if (banksOverlap(OpXRegs[CompOprIdx], OpYRegs[CompOprIdx], BanksMasks) && + (!AllowSameVGPR || CompOprIdx < Component::DST_NUM || + OpXRegs[CompOprIdx] != OpYRegs[CompOprIdx])) return CompOprIdx; } @@ -822,9 +924,10 @@ std::optional InstInfo::getInvalidCompOperandIndex( // GetRegIdx(Component, MCOperandIdx) must return a VGPR register index // for the specified component and MC operand. The callback must return 0 // if the operand is not a register or not a VGPR. -InstInfo::RegIndices InstInfo::getRegIndices( - unsigned CompIdx, - std::function GetRegIdx) const { +InstInfo::RegIndices +InstInfo::getRegIndices(unsigned CompIdx, + std::function GetRegIdx, + bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); const auto &Comp = CompInfo[CompIdx]; @@ -836,7 +939,8 @@ InstInfo::RegIndices InstInfo::getRegIndices( unsigned CompSrcIdx = CompOprIdx - DST_NUM; RegIndices[CompOprIdx] = Comp.hasRegSrcOperand(CompSrcIdx) - ? GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx)) + ? 
GetRegIdx(CompIdx, + Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) : 0; } return RegIndices; @@ -853,8 +957,9 @@ VOPD::InstInfo getVOPDInstInfo(unsigned VOPDOpcode, auto [OpX, OpY] = getVOPDComponents(VOPDOpcode); const auto &OpXDesc = InstrInfo->get(OpX); const auto &OpYDesc = InstrInfo->get(OpY); - VOPD::ComponentInfo OpXInfo(OpXDesc, VOPD::ComponentKind::COMPONENT_X); - VOPD::ComponentInfo OpYInfo(OpYDesc, OpXInfo); + bool VOPD3 = InstrInfo->get(VOPDOpcode).TSFlags & SIInstrFlags::VOPD3; + VOPD::ComponentInfo OpXInfo(OpXDesc, VOPD::ComponentKind::COMPONENT_X, VOPD3); + VOPD::ComponentInfo OpYInfo(OpYDesc, OpXInfo, VOPD3); return VOPD::InstInfo(OpXInfo, OpYInfo); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 4f7d18170d586..6708e0a3f4549 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -593,6 +593,11 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +// Get an equivalent BitOp3 for a binary logical \p Opc. +// \returns BitOp3 modifier for the logical operation or zero. +// Used in VOPD3 conversion. 
+unsigned getBitOp2(unsigned Opc); + struct CanBeVOPD { bool X; bool Y; @@ -603,7 +608,7 @@ LLVM_READONLY unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST); LLVM_READONLY -CanBeVOPD getCanBeVOPD(unsigned Opc); +CanBeVOPD getCanBeVOPD(unsigned Opc, unsigned EncodingFamily, bool VOPD3); LLVM_READNONE uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal); @@ -626,10 +631,11 @@ LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); LLVM_READONLY -unsigned getVOPDOpcode(unsigned Opc); +unsigned getVOPDOpcode(unsigned Opc, bool VOPD3); LLVM_READONLY -int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily); +int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily, + bool VOPD3); LLVM_READONLY bool isVOPD(unsigned Opc); @@ -662,6 +668,7 @@ enum Component : unsigned { // LSB mask for VGPR banks per VOPD component operand. // 4 banks result in a mask 3, setting 2 lower bits. constexpr unsigned VOPD_VGPR_BANK_MASKS[] = {1, 3, 3, 1}; +constexpr unsigned VOPD3_VGPR_BANK_MASKS[] = {1, 3, 3, 3}; enum ComponentIndex : unsigned { X = 0, Y = 1 }; constexpr unsigned COMPONENTS[] = {ComponentIndex::X, ComponentIndex::Y}; @@ -673,10 +680,13 @@ class ComponentProps { unsigned SrcOperandsNum = 0; unsigned MandatoryLiteralIdx = ~0u; bool HasSrc2Acc = false; + unsigned NumVOPD3Mods = 0; + unsigned Opcode = 0; + bool IsVOP3 = false; public: ComponentProps() = default; - ComponentProps(const MCInstrDesc &OpDesc); + ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout = false); // Return the total number of src operands this component has. unsigned getCompSrcOperandsNum() const { return SrcOperandsNum; } @@ -706,6 +716,18 @@ class ComponentProps { // Return true iif this component has tied src2. bool hasSrc2Acc() const { return HasSrc2Acc; } + // Return a number of source modifiers if instruction is used in VOPD3. + unsigned getCompVOPD3ModsNum() const { return NumVOPD3Mods; } + + // Return opcode of the component. 
+ unsigned getOpcode() const { return Opcode; } + + // Returns if component opcode is in VOP3 encoding. + unsigned isVOP3() const { return IsVOP3; } + + // Return index of BitOp3 operand or -1. + int getBitOp3OperandIdx() const; + private: bool hasMandatoryLiteralAt(unsigned CompSrcIdx) const { assert(CompSrcIdx < Component::MAX_SRC_NUM); @@ -758,7 +780,15 @@ class ComponentLayout { // dstX, dstY, src0X [, other OpX operands], src0Y [, other OpY operands] // Each ComponentKind has operand indices defined below. static constexpr unsigned MC_DST_IDX[] = {0, 0, 1}; - static constexpr unsigned FIRST_MC_SRC_IDX[] = {1, 2, 2 /* + OpX.MCSrcNum */}; + + // VOPD3 instructions may have 2 or 3 source modifiers, src2 modifier is not + // used if there is tied accumulator. Indexing of this array: + // MC_SRC_IDX[VOPD3ModsNum][SrcNo]. This returns an index for a SINGLE + // instruction layout, add 1 for COMPONENT_X or COMPONENT_Y. For the second + // component add OpX.MCSrcNum + OpX.VOPD3ModsNum. + // For VOPD1/VOPD2 use column with zero modifiers. + static constexpr unsigned SINGLE_MC_SRC_IDX[4][3] = { + {1, 2, 3}, {2, 3, 4}, {2, 4, 5}, {2, 4, 6}}; // Parsed operands of regular instructions are ordered as follows: // Mnemo dst src0 [vsrc1 ...] @@ -774,25 +804,40 @@ class ComponentLayout { private: const ComponentKind Kind; const ComponentProps PrevComp; + const unsigned VOPD3ModsNum; + const int BitOp3Idx; // Index of bitop3 operand or -1 public: // Create layout for COMPONENT_X or SINGLE component. - ComponentLayout(ComponentKind Kind) : Kind(Kind) { + ComponentLayout(ComponentKind Kind, unsigned VOPD3ModsNum, int BitOp3Idx) + : Kind(Kind), VOPD3ModsNum(VOPD3ModsNum), BitOp3Idx(BitOp3Idx) { assert(Kind == ComponentKind::SINGLE || Kind == ComponentKind::COMPONENT_X); } // Create layout for COMPONENT_Y which depends on COMPONENT_X layout. 
- ComponentLayout(const ComponentProps &OpXProps) - : Kind(ComponentKind::COMPONENT_Y), PrevComp(OpXProps) {} + ComponentLayout(const ComponentProps &OpXProps, unsigned VOPD3ModsNum, + int BitOp3Idx) + : Kind(ComponentKind::COMPONENT_Y), PrevComp(OpXProps), + VOPD3ModsNum(VOPD3ModsNum), BitOp3Idx(BitOp3Idx) {} public: // Return the index of dst operand in MCInst operands. unsigned getIndexOfDstInMCOperands() const { return MC_DST_IDX[Kind]; } // Return the index of the specified src operand in MCInst operands. - unsigned getIndexOfSrcInMCOperands(unsigned CompSrcIdx) const { + unsigned getIndexOfSrcInMCOperands(unsigned CompSrcIdx, bool VOPD3) const { assert(CompSrcIdx < Component::MAX_SRC_NUM); - return FIRST_MC_SRC_IDX[Kind] + getPrevCompSrcNum() + CompSrcIdx; + + if (Kind == SINGLE && CompSrcIdx == 2 && BitOp3Idx != -1) + return BitOp3Idx; + + if (VOPD3) { + return SINGLE_MC_SRC_IDX[VOPD3ModsNum][CompSrcIdx] + getPrevCompSrcNum() + + getPrevCompVOPD3ModsNum() + (Kind != SINGLE ? 1 : 0); + } + + return SINGLE_MC_SRC_IDX[0][CompSrcIdx] + getPrevCompSrcNum() + + (Kind != SINGLE ? 1 : 0); } // Return the index of dst operand in the parsed operands array. @@ -813,19 +858,27 @@ class ComponentLayout { unsigned getPrevCompParsedSrcNum() const { return PrevComp.getCompParsedSrcOperandsNum(); } + unsigned getPrevCompVOPD3ModsNum() const { + return PrevComp.getCompVOPD3ModsNum(); + } }; // Layout and properties of VOPD components. -class ComponentInfo : public ComponentLayout, public ComponentProps { +class ComponentInfo : public ComponentProps, public ComponentLayout { public: // Create ComponentInfo for COMPONENT_X or SINGLE component. 
ComponentInfo(const MCInstrDesc &OpDesc, - ComponentKind Kind = ComponentKind::SINGLE) - : ComponentLayout(Kind), ComponentProps(OpDesc) {} + ComponentKind Kind = ComponentKind::SINGLE, + bool VOP3Layout = false) + : ComponentProps(OpDesc, VOP3Layout), + ComponentLayout(Kind, getCompVOPD3ModsNum(), getBitOp3OperandIdx()) {} // Create ComponentInfo for COMPONENT_Y which depends on COMPONENT_X layout. - ComponentInfo(const MCInstrDesc &OpDesc, const ComponentProps &OpXProps) - : ComponentLayout(OpXProps), ComponentProps(OpDesc) {} + ComponentInfo(const MCInstrDesc &OpDesc, const ComponentProps &OpXProps, + bool VOP3Layout = false) + : ComponentProps(OpDesc, VOP3Layout), + ComponentLayout(OpXProps, getCompVOPD3ModsNum(), + getBitOp3OperandIdx()) {} // Map component operand index to parsed operand index. // Return 0 if the specified operand does not exist. @@ -857,23 +910,36 @@ class InstInfo { // if the operand is not a register or not a VGPR. // If \p SkipSrc is set to true then constraints for source operands are not // checked. + // If \p AllowSameVGPR is set then same VGPRs are allowed for X and Y sources + // even though it violates requirement to be from different banks. + // If \p VOPD3 is set to true both dst registers allowed to be either odd + // or even and instruction may have real src2 as opposed to tied accumulator. bool hasInvalidOperand(std::function GetRegIdx, - bool SkipSrc = false) const { - return getInvalidCompOperandIndex(GetRegIdx, SkipSrc).has_value(); + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const { + return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR, + VOPD3) + .has_value(); } // Check VOPD operands constraints. // Return the index of an invalid component operand, if any. // If \p SkipSrc is set to true then constraints for source operands are not - // checked. + // checked except for being from the same halves of VGPR file on gfx1250. 
+ // If \p AllowSameVGPR is set then same VGPRs are allowed for X and Y sources + // even though it violates requirement to be from different banks. + // If \p VOPD3 is set to true both dst registers allowed to be either odd + // or even and instruction may have real src2 as opposed to tied accumulator. std::optional getInvalidCompOperandIndex( std::function GetRegIdx, - bool SkipSrc = false) const; + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const; private: RegIndices getRegIndices(unsigned ComponentIdx, - std::function GetRegIdx) const; + std::function GetRegIdx, + bool VOPD3) const; }; } // namespace VOPD @@ -1537,6 +1603,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + case AMDGPU::OPERAND_KIMM64: return 8; case AMDGPU::OPERAND_REG_IMM_INT16: diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 2c0871347ebb9..211112e5262a3 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -892,6 +892,8 @@ class VOP1_DPP16_Gen op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = VOP1_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; let DecoderNamespace = Gen.DecoderNamespace; + let OtherPredicates = !listconcat(ps.OtherPredicates, + !if(p.HasExt64BitDPP, [HasDPALU_DPP], [])); } class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -967,7 +969,8 @@ multiclass VOP1_Real_dpp_with_name op, string opName, multiclass VOP1_Real_dpp8 op, string opName = NAME> { defvar ps = !cast(opName#"_e32"); - def _dpp8#Gen.Suffix : VOP1_DPP8_Gen; + if !not(ps.Pfl.HasExt64BitDPP) then + def _dpp8#Gen.Suffix : VOP1_DPP8_Gen; } multiclass VOP1_Real_dpp8_with_name op, string opName, @@ -976,7 +979,8 @@ multiclass VOP1_Real_dpp8_with_name op, string opName, let AsmString = asmName # 
ps.Pfl.AsmDPP8, DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { - defm NAME : VOP1_Real_dpp8; + if !not(ps.Pfl.HasExt64BitDPP) then + defm NAME : VOP1_Real_dpp8; } } diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 859d5bae3d460..25c6cbc3e1ab5 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -36,6 +36,20 @@ class VOP2_MADKe op, VOPProfile P> : Enc64 { let Inst{63-32} = imm; } +class VOP2_MADK64e op, VOPProfile P> : Enc96 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + bits<64> imm; + + let Inst{8-0} = !if(P.HasSrc0, src0, 0); + let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{24-17} = !if(P.EmitDst, vdst, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{95-32} = imm; +} + class VOP2_SDWAe op, VOPProfile P> : VOP_SDWAe

{ bits<8> vdst; bits<8> src1; @@ -148,7 +162,7 @@ multiclass VOP2Inst_e32; } multiclass - VOP2Inst_e32_VOPD VOPDOp, + VOP2Inst_e32_VOPD VOPDOp, string VOPDName, SDPatternOperator node = null_frag, string revOp = opName> { defm NAME : VOP2Inst_e32, @@ -167,6 +181,15 @@ multiclass VOP2Inst_e64 VOPDOp, + string VOPDName, + SDPatternOperator node = null_frag, + string revOp = opName> { + defm NAME: VOP2Inst_e64, + VOPD_Component; +} + multiclass VOP2Inst_sdwa { @@ -227,12 +250,12 @@ multiclass VOP2Inst_e64_t16 VOPDOp, + bits<6> VOPDOp, string VOPDName, SDPatternOperator node = null_frag, string revOp = opName> : VOP2Inst_e32_VOPD, - VOP2Inst_e64, + VOP2Inst_e64_VOPD, VOP2Inst_sdwa { if P.HasExtDPP then def _dpp : VOP2_DPP_Pseudo ; @@ -288,7 +311,7 @@ multiclass VOP2bInstAliases { } multiclass - VOP2eInst_Base VOPDOp, string VOPDName, + VOP2eInst_Base VOPDOp, string VOPDName, SDPatternOperator node, string revOp, bit useSGPRInput> { let SchedRW = [Write32Bit] in { @@ -310,9 +333,14 @@ multiclass def _dpp : VOP2_DPP_Pseudo ; } - def _e64 : VOP3InstBase , - Commutable_REV { - let isReMaterializable = 1; + let isReMaterializable = 1 in { + if !empty(VOPDName) then + def _e64 : VOP3InstBase , + Commutable_REV; + else + def _e64 : VOP3InstBase , + Commutable_REV, + VOPD_Component; } let SubtargetPredicate = isGFX11Plus in { @@ -328,7 +356,7 @@ multiclass : VOP2eInst_Base; multiclass - VOP2eInst_VOPD VOPDOp, string VOPDName, + VOP2eInst_VOPD VOPDOp, string VOPDName, SDPatternOperator node = null_frag, string revOp = opName, bit useSGPRInput = !eq(P.NumSrcArgs, 3)> : VOP2eInst_Base; @@ -361,10 +389,14 @@ class VOP_MADK_Base : VOPProfile <[vt, vt, vt, vt]> { } class VOP_MADAK : VOP_MADK_Base { - field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, KImmFP16); + field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, + !if(!eq(vt.Size, 64), KImmFP64, + KImmFP16)); field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), - (ins 
VSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm)); + !if(!eq(vt.Size, 64), + (ins VSrc_f64:$src0, VReg_64:$src1, ImmOpType:$imm), + (ins VSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm))); field dag InsVOPDX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm); let InsVOPDX_immX = (ins VSrc_f32:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immX); field dag InsVOPDY = (ins VSrc_f32:$src0Y, VGPR_32:$vsrc1Y, ImmOpType:$imm); @@ -390,12 +422,17 @@ def VOP_MADAK_F16_fake16 : VOP_MADAK { let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm); } def VOP_MADAK_F32 : VOP_MADAK ; +def VOP_MADAK_F64 : VOP_MADAK ; class VOP_MADMK : VOP_MADK_Base { - field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, KImmFP16); + field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, + !if(!eq(vt.Size, 64), KImmFP64, + KImmFP16)); field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1), - (ins VSrc_f16:$src0, ImmOpType:$imm, VGPR_32:$src1)); + !if(!eq(vt.Size, 64), + (ins VSrc_f64:$src0, ImmOpType:$imm, VReg_64:$src1), + (ins VSrc_f16:$src0, ImmOpType:$imm, VGPR_32:$src1))); field dag InsVOPDX = (ins VSrc_f32:$src0X, ImmOpType:$imm, VGPR_32:$vsrc1X); let InsVOPDX_immX = (ins VSrc_f32:$src0X, ImmOpType:$immX, VGPR_32:$vsrc1X); field dag InsVOPDY = (ins VSrc_f32:$src0Y, ImmOpType:$imm, VGPR_32:$vsrc1Y); @@ -421,6 +458,7 @@ def VOP_MADMK_F16_fake16 : VOP_MADMK { let Ins32 = (ins VSrcFake16_f16_Lo128:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1); } def VOP_MADMK_F32 : VOP_MADMK ; +def VOP_MADMK_F64 : VOP_MADMK ; // Returns the vreg register class to use for sources of VOP3 instructions for the // given VT. 
@@ -458,6 +496,12 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v // We need a dummy src2 tied to dst to track the use of that register for s_delay_alu let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPRSrc_32:$src2X); let InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y, VGPRSrc_32:$src2Y); + let InsVOPD3X = (ins Src0ModVOPD3:$src0X_modifiers, Src0VOPD3:$src0X, + Src1ModVOPD3:$vsrc1X_modifiers, Src1RC32:$vsrc1X, + VGPRSrc_32:$src2X); + let InsVOPD3Y = (ins Src0ModVOPD3:$src0Y_modifiers, Src0VOPD3:$src0Y, + Src1ModVOPD3:$vsrc1Y_modifiers, Src1RC32:$vsrc1Y, + VGPRSrc_32:$src2Y); let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, @@ -482,6 +526,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v 0 /*Src2HasMods*/, DstVT>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; + let HasVOPD3Src2 = 0; let HasExt = 1; let HasExtDPP = 1; @@ -522,8 +567,8 @@ def VOP_MAC_F16_t16 : VOP_MAC { let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; let Src0Mod = getSrc0Mod.ret; let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; @@ -554,8 +599,8 @@ def VOP_MAC_F16_fake16 : VOP_MAC { let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; let Src0Mod = getSrc0Mod.ret; let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; @@ -724,7 +769,14 @@ class VOP2e_SGPR ArgVT> : VOPProfile { let HasExtSDWA9 = 1; } -def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>; +def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]> { + 
let Src2VOPD3 = SSrc_i1; + let InsVOPD3X = (ins FP32VCSrcInputMods:$src0X_modifiers, Src0VOPD3:$src0X, FP32VRegSrcInputMods:$vsrc1X_modifiers, Src1VOPD3:$vsrc1X, Src2VOPD3:$vsrc2X); + let InsVOPD3Y = (ins FP32VCSrcInputMods:$src0Y_modifiers, Src0VOPD3:$src0Y, FP32VRegSrcInputMods:$vsrc1Y_modifiers, Src1VOPD3:$vsrc1Y, Src2VOPD3:$vsrc2Y); + let AsmVOPD3X = "$vdstX, $src0X_modifiers, $vsrc1X_modifiers, $vsrc2X"; + let AsmVOPD3Y = "$vdstY, $src0Y_modifiers, $vsrc1Y_modifiers, $vsrc2Y"; + let HasVOPD3Src2 = 0; +} def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; // V_CNDMASK_B16 is VOP3 only def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> { @@ -745,7 +797,7 @@ def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let Src0VOP3DPP = VGPRSrc_16; let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; } def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let IsTrue16 = 1; @@ -757,7 +809,7 @@ def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let Src0VOP3DPP = VGPRSrc_32; let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; } def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> { @@ -819,12 +871,12 @@ defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>; defm V_MIN_F32 : VOP2Inst_VOPD <"v_min_f32", VOP_F32_F32_F32, 0xb, "v_min_f32", fminnum_like>; defm V_MAX_F32 : VOP2Inst_VOPD <"v_max_f32", VOP_F32_F32_F32, 0xa, "v_max_f32", fmaxnum_like>; -defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN, smin>; -defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN, smax>; +defm V_MIN_I32 : VOP2Inst_VOPD <"v_min_i32", VOP_PAT_GEN, 0x18, "v_min_i32", smin>; 
+defm V_MAX_I32 : VOP2Inst_VOPD <"v_max_i32", VOP_PAT_GEN, 0x17, "v_max_i32", smax>; defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN, umin>; defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN, umax>; -defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">; -defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">; +defm V_LSHRREV_B32 : VOP2Inst_VOPD <"v_lshrrev_b32", VOP_I32_I32_I32, 0x15, "v_lshrrev_b32", clshr_rev_32, "v_lshr_b32">; +defm V_ASHRREV_I32 : VOP2Inst_VOPD <"v_ashrrev_i32", VOP_I32_I32_I32, 0x16, "v_ashrrev_i32", cashr_rev_32, "v_ashr_i32">; defm V_LSHLREV_B32 : VOP2Inst_VOPD <"v_lshlrev_b32", VOP_I32_I32_I32, 0x11, "v_lshlrev_b32", clshl_rev_32, "v_lshl_b32">; defm V_AND_B32 : VOP2Inst_VOPD <"v_and_b32", VOP_PAT_GEN, 0x12, "v_and_b32", and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN, or>; @@ -856,7 +908,7 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { - defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32">; + defm V_SUB_U32 : VOP2Inst_VOPD <"v_sub_u32", VOP_I32_I32_I32_ARITH, 0x14, "v_sub_nc_u32", null_frag, "v_sub_u32">; defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32">; } @@ -1261,12 +1313,20 @@ let AddedComplexity = 30 in { } } // End AddedComplexity = 30 -let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, FixedSize = 1 in { +let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, CanBeVOPD3X = 0, FixedSize = 1 in { def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">, VOPD_Component<0x2, "v_fmamk_f32">; let isCommutable = 1 in def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">, VOPD_Component<0x1, "v_fmaak_f32">; -} // End SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, FixedSize = 1 +} // 
End SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, CanBeVOPD3X = 0, FixedSize = 1 + +let SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, + FixedSize = 1, Size = 12, SchedRW = [Write64Bit] in { +def V_FMAMK_F64 : VOP2_Pseudo<"v_fmamk_f64", VOP_MADMK_F64, [], "">; + +let isCommutable = 1 in +def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; +} // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] let SubtargetPredicate = HasPkFmacF16Inst in { defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; @@ -1390,12 +1450,10 @@ def : VOPBinOpClampPat; } let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in { - let SchedRW = [WriteDoubleAdd], isCommutable = 1 in { - let FPDPRounding = 1 in { - defm V_ADD_F64_pseudo : VOP2Inst <"v_add_f64_pseudo", VOP_F64_F64_F64, any_fadd>; - defm V_MUL_F64_pseudo : VOP2Inst <"v_mul_f64_pseudo", VOP_F64_F64_F64, fmul>; - } // End FPDPRounding = 1 - } // End SchedRW = [WriteDoubleAdd], isCommutable = 1 + let SchedRW = [WriteDoubleAdd], isCommutable = 1, FPDPRounding = 1 in { + defm V_ADD_F64_pseudo : VOP2Inst_VOPD <"v_add_f64_pseudo", VOP_F64_F64_F64, 0x21, "v_add_f64", any_fadd>; + defm V_MUL_F64_pseudo : VOP2Inst_VOPD <"v_mul_f64_pseudo", VOP_F64_F64_F64, 0x22, "v_mul_f64", fmul>; + } // End SchedRW = [WriteDoubleAdd], isCommutable = 1, FPDPRounding = 1 let SchedRW = [Write64Bit] in { defm V_LSHLREV_B64_pseudo : VOP2Inst <"v_lshlrev_b64_pseudo", VOP_I64_I32_I64, clshl_rev_64>; } // End SchedRW = [Write64Bit] @@ -1403,8 +1461,8 @@ let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in { let SubtargetPredicate = HasIEEEMinimumMaximumInsts, isReMaterializable = 1, SchedRW = [WriteDoubleAdd], isCommutable = 1 in { - defm V_MIN_NUM_F64 : VOP2Inst <"v_min_num_f64", VOP_F64_F64_F64, fminnum_like>; - defm V_MAX_NUM_F64 : VOP2Inst <"v_max_num_f64", VOP_F64_F64_F64, fmaxnum_like>; + defm 
V_MIN_NUM_F64 : VOP2Inst_VOPD <"v_min_num_f64", VOP_F64_F64_F64, 0x24, "v_min_num_f64", fminnum_like>; + defm V_MAX_NUM_F64 : VOP2Inst_VOPD <"v_max_num_f64", VOP_F64_F64_F64, 0x23, "v_max_num_f64", fmaxnum_like>; } //===----------------------------------------------------------------------===// @@ -1492,6 +1550,14 @@ multiclass VOP2Only_Real_MADK op> { VOP2_MADKe(NAME).Pfl>; } +multiclass VOP2Only_Real_MADK64 op> { + def Gen.Suffix : + VOP2_Real_Gen(NAME), Gen>, + VOP2_MADK64e(NAME).Pfl> { + let DecoderNamespace = Gen.DecoderNamespace; + } +} + multiclass VOP2Only_Real_MADK_with_name op, string asmName, string opName = NAME> { def Gen.Suffix : @@ -1766,6 +1832,9 @@ let SubtargetPredicate = isGFX12Plus in { V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">; } // End SubtargetPredicate = isGFX12Plus +defm V_FMAMK_F64 : VOP2Only_Real_MADK64; +defm V_FMAAK_F64 : VOP2Only_Real_MADK64; + //===----------------------------------------------------------------------===// // GFX11. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index e7ebc109b5dd5..75c531913ded1 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -51,6 +51,8 @@ class V_MUL_PROF : VOP3_Profile

{ def V_LSHL_ADD_U64_PROF : VOP3_Profile; +def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile; + def DIV_FIXUP_F32_PROF : VOP3_Profile { let HasExtVOP3DPP = 0; let HasExtDPP = 0; @@ -147,12 +149,12 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile>; defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile>; -defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, any_fma>; +defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { -defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, any_fma>; +defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; let SubtargetPredicate = isNotGFX12Plus in { defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, any_fadd>; defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, any_fmul>; @@ -1033,7 +1035,11 @@ class VOP3_BITOP3_Profile : VOP3_Profile let HasClamp = 0; let HasOMod = 0; let HasModifiers = 0; + let HasVOPD3Src2 = 0; let HasBitOp3 = 1; + + let InsVOPD3Y = (ins Src0VOPD3:$src0Y, Src1VOPD3:$vsrc1Y, bitop3_0:$bitop3); + let AsmVOPD3Y = getAsmVOPDPart.ret # "$bitop3"; } class VOP3_CVT_SCALE_F1632_FP8BF8_Profile : VOP3_Profile, @@ -1416,7 +1422,8 @@ let SubtargetPredicate = HasBitOp3Insts in { defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", VOP3_BITOP3_Profile, VOP3_OPSEL>>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", - VOP3_BITOP3_Profile, VOP3_REGULAR>>; + VOP3_BITOP3_Profile, VOP3_REGULAR>>, + VOPD_Component<0x12, "v_bitop2_b32">; } def : GCNPat< (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 9e84f6aed0176..2c1193509da9b 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -112,8 +112,8 @@ multiclass VOPC_Profile_t16 sched, ValueType vt0, ValueType let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } def _fake16: VOPC_Profile { let IsTrue16 = 1; @@ -138,8 +138,8 @@ multiclass VOPC_Profile_t16 sched, ValueType vt0, ValueType let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } } @@ -184,8 +184,8 @@ multiclass VOPC_NoSdst_Profile_t16 sched, ValueType vt0, Va let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } def _fake16 : VOPC_NoSdst_Profile { let IsTrue16 = 1; @@ -208,8 +208,8 @@ multiclass VOPC_NoSdst_Profile_t16 sched, ValueType vt0, Va let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } } @@ -929,8 +929,8 @@ multiclass VOPC_Class_Profile_t16 sched> { let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } def _fake16 : VOPC_Class_Profile_Base { let IsTrue16 = 1; 
@@ -955,8 +955,8 @@ multiclass VOPC_Class_Profile_t16 sched> { let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } } @@ -998,8 +998,8 @@ multiclass VOPC_Class_NoSdst_Profile_t16 sched> { let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } def _fake16 : VOPC_Class_NoSdst_Profile { let IsTrue16 = 1; @@ -1022,8 +1022,8 @@ multiclass VOPC_Class_NoSdst_Profile_t16 sched> { let Src1Mod = getSrcMod.ret; let Src2Mod = getSrcMod.ret; let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; - let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; - let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src1ModVOP3DPP = getSrcModVOP3VC.ret; + let Src2ModVOP3DPP = getSrcModVOP3VC.ret; } } diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td index 4054002c41478..3e7af12f6b600 100644 --- a/llvm/lib/Target/AMDGPU/VOPDInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td @@ -50,6 +50,47 @@ class VOPD_MADKe opX, bits<5> opY> : Enc96 { let Inst{95-64} = imm; } +class VOPD3e opX, bits<6> opY, VOP_Pseudo VDX, VOP_Pseudo VDY> : Enc96 { + bits<9> src0X; + bits<8> vsrc1X; + bits<8> vsrc2X; + bits<8> vdstX; + bits<9> src0Y; + bits<8> vsrc1Y; + bits<8> vsrc2Y; + bits<8> vdstY; + // neg modifiers + bit src0X_modifiers; + bit src0Y_modifiers; + bit vsrc1X_modifiers; + bit vsrc1Y_modifiers; + bit vsrc2X_modifiers; + bit vsrc2Y_modifiers; + bits<8> bitop3; + + let Inst{8-0} = src0X; + let Inst{17-12} = opY; + let Inst{23-18} = opX; + let Inst{31-24} = 0xcf; // encoding + let Inst{40-32} = src0Y; + let Inst{41} = 
!if(VDX.Pfl.HasModifiers, src0X_modifiers, 0); + let Inst{42} = !if(!and(VDX.Pfl.HasSrc1, VDX.Pfl.HasModifiers), vsrc1X_modifiers, 0); + let Inst{43} = !if(!and(VDX.Pfl.HasVOPD3Src2, VDX.Pfl.HasModifiers), vsrc2X_modifiers, 0); + let Inst{44} = !if(VDY.Pfl.HasModifiers, src0Y_modifiers, 0); + let Inst{45} = !if(!and(VDY.Pfl.HasSrc1, VDY.Pfl.HasModifiers), vsrc1Y_modifiers, 0); + let Inst{46} = !if(!and(VDY.Pfl.HasVOPD3Src2, VDY.Pfl.HasModifiers), vsrc2Y_modifiers, 0); + let Inst{55-48} = !if(!eq(!find(VDX.Pfl.AsmVOPD3X, "$vsrc1X"), -1), 0, vsrc1X); + + // Despite the vsrc operand name, SGPRs can be used for vsrc2X for + // V_DUAL_CNDMASK_B32 + let Inst{63-56} = !if(!eq(!find(VDX.Pfl.AsmVOPD3X, "$vsrc2X"), -1), 0, vsrc2X); + let Inst{71-64} = vdstX; + let Inst{79-72} = !if(!eq(!find(VDY.Pfl.AsmVOPD3Y, "$vsrc1Y"), -1), 0, vsrc1Y); + let Inst{87-80} = !if(!ne(!find(VDY.Pfl.AsmVOPD3Y, "bitop"), -1), bitop3, + !if(!eq(!find(VDY.Pfl.AsmVOPD3Y, "$vsrc2Y"), -1), 0, vsrc2Y)); + let Inst{95-88} = vdstY; +} + //===----------------------------------------------------------------------===// // VOPD classes //===----------------------------------------------------------------------===// @@ -71,8 +112,8 @@ class VOPD_Base { // Fields for table indexing Instruction Opcode = !cast(NAME); - bits<5> OpX = XasVC.VOPDOp; - bits<5> OpY = YasVC.VOPDOp; + bits<6> OpX = XasVC.VOPDOp; + bits<6> OpY = YasVC.VOPDOp; bits<4> SubTgt = Gen.Subtarget; let VALU = 1; @@ -110,7 +151,7 @@ class VOPD_Base : VOPD_Base, - VOPDe { + VOPDe { let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); } @@ -118,29 +159,48 @@ class VOPD : VOPD_Base, - VOPD_MADKe { + VOPD_MADKe { let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); let Size = 12; let FixedSize = 1; } +class VOPD3 + : VOPD_Base, + VOPD3e { + let VOPD3 = 1; + let Size = 12; + // VOPD3 
uses promoted form of VOP2 instructions, so V_CNDMASK_B32 is not + // limited to VCC src2 only, and a real SGPR will be used as an operand + // instead. + defvar UsesX = !if(!eq(VDX, V_CNDMASK_B32_e32), !filter(x, VDX.Uses, !ne(x, VCC)), VDX.Uses); + defvar UsesY = !if(!eq(VDY, V_CNDMASK_B32_e32), !filter(x, VDY.Uses, !ne(x, VCC)), VDY.Uses); + let Uses = RegListUnion.ret; +} + defvar VOPDPseudosCommon = [ "V_FMAC_F32_e32", "V_FMAAK_F32", "V_FMAMK_F32", "V_MUL_F32_e32", "V_ADD_F32_e32", "V_SUB_F32_e32", "V_SUBREV_F32_e32", "V_MUL_LEGACY_F32_e32", "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32", "V_DOT2C_F32_F16_e32", "V_DOT2C_F32_BF16_e32" ]; -defvar VOPDYOnlyPseudosCommon = ["V_ADD_U32_e32", "V_LSHLREV_B32_e32", - "V_AND_B32_e32"]; +defvar VOPDYOnlyPseudosCommon = ["V_ADD_U32_e32", "V_LSHLREV_B32_e32"]; +defvar VOPDYOnlyPseudosGFX11_12 = ["V_AND_B32_e32"]; +defvar VOPDYOnlyPseudosGFX1250 = ["V_MAX_I32_e32", "V_MIN_I32_e32", + "V_SUB_U32_e32", "V_LSHRREV_B32_e32", + "V_ASHRREV_I32_e32"]; defvar VOPDXPseudosGFX11 = VOPDPseudosCommon; defvar VOPDXPseudosGFX12 = VOPDPseudosCommon; -defvar VOPDYPseudosGFX11 = !listconcat(VOPDXPseudosGFX11, VOPDYOnlyPseudosCommon); -defvar VOPDYPseudosGFX12 = !listconcat(VOPDXPseudosGFX12, VOPDYOnlyPseudosCommon); +defvar VOPDYPseudosGFX11 = !listconcat(VOPDXPseudosGFX11, VOPDYOnlyPseudosCommon, VOPDYOnlyPseudosGFX11_12); +defvar VOPDYPseudosGFX12 = !listconcat(VOPDXPseudosGFX12, VOPDYOnlyPseudosCommon, VOPDYOnlyPseudosGFX11_12); +defvar VOPDYPseudosGFX1250 = !listconcat(VOPDXPseudosGFX12, VOPDYOnlyPseudosCommon, VOPDYOnlyPseudosGFX1250); def GFX11GenD : GFXGenD; -def GFX12GenD : GFXGenD; +def GFX12GenD : GFXGenD; +def GFX1250GenD : GFXGenD; def VOPDDstYOperand : RegisterOperand { @@ -148,16 +208,13 @@ def VOPDDstYOperand : RegisterOperand { } class getRenamed { - string ret = !if(!eq(Gen.Subtarget, GFX12Gen.Subtarget), - !if(!eq(VOPDName, "v_dual_max_f32"), - "v_dual_max_num_f32", - !if(!eq(VOPDName, 
"v_dual_min_f32"), - "v_dual_min_num_f32", - VOPDName)), - VOPDName); + string ret = !cond(!eq(Gen.Subtarget, GFX11Gen.Subtarget) : VOPDName, + !eq(VOPDName, "v_dual_max_f32") : "v_dual_max_num_f32", + !eq(VOPDName, "v_dual_min_f32") : "v_dual_min_num_f32", + true : VOPDName); } -foreach Gen = [GFX11GenD, GFX12GenD] in { +foreach Gen = [GFX11GenD, GFX12GenD, GFX1250GenD] in { foreach x = Gen.VOPDXPseudos in { foreach y = Gen.VOPDYPseudos in { defvar xInst = !cast(x); @@ -192,3 +249,41 @@ foreach Gen = [GFX11GenD, GFX12GenD] in { } } +defvar VOPD3XPseudosExtra = ["V_ADD_U32_e32", "V_LSHLREV_B32_e32", "V_FMA_F32_e64", "V_SUB_U32_e32", + "V_LSHRREV_B32_e32", "V_ASHRREV_I32_e32", "V_FMA_F64_e64", "V_ADD_F64_pseudo_e32", + "V_MUL_F64_pseudo_e32", "V_MAX_NUM_F64_e32", "V_MIN_NUM_F64_e32"]; +defvar VOPD3XPseudosGFX1250 = !listconcat( + !filter(x, VOPDXPseudosGFX12, !and(!eq(!find(x, "FMAAK"), -1), + !eq(!find(x, "FMAMK"), -1))), + VOPD3XPseudosExtra); +defvar VOPD3YPseudosExtra = ["V_BITOP3_B32_e64", "V_FMA_F32_e64"]; +defvar VOPD3YPseudosGFX1250 = !listconcat( + !filter(x, VOPDYPseudosGFX1250, !and(!eq(!find(x, "FMAAK"), -1), + !eq(!find(x, "FMAMK"), -1))), + VOPD3YPseudosExtra); + +def GFX1250GenD3 : GFXGenD; + +class getOpcMap { + defvar BaseName = !substr(OPName,2); + string ret = !cond(!eq(BaseName, "BITOP3_B32_e64") : "BITOP2_B32_e64", + 1 : BaseName); +} + +foreach Gen = [GFX1250GenD3] in { + foreach x = Gen.VOPDXPseudos in { + foreach y = Gen.VOPDYPseudos in { + defvar xInst = !cast(x); + defvar yInst = !cast(y); + defvar XasVC = !cast(x); + defvar YasVC = !cast(y); + defvar xAsmName = getRenamed.ret; + defvar yAsmName = getRenamed.ret; + defvar OpName = "V_DUAL_" # getOpcMap.ret # "_X_" # getOpcMap.ret # "_e96" # Gen.Suffix; + defvar asm = xAsmName # xInst.Pfl.AsmVOPD3X #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPD3Y; + defvar ins = !con(xInst.Pfl.InsVOPD3X, yInst.Pfl.InsVOPD3Y); + defvar outs = (outs xInst.Pfl.DstRC:$vdstX, yInst.Pfl.DstRC:$vdstY); + def OpName : 
VOPD3; + } + } +} diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 3e01f8cd044e2..df215d23f7f40 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -35,12 +35,17 @@ class VOP { // First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted defvar VOPDX_Max_Index = 12; +defvar VOPD3X_Max_Index = 36; -class VOPD_Component OpIn, string vOPDName> { +class VOPD_Component OpIn, string vOPDName> { Instruction BaseVOP = !cast(NAME); string VOPDName = "v_dual_" # !substr(vOPDName, 2); - bits<5> VOPDOp = OpIn; + bits<6> VOPDOp = OpIn; bit CanBeVOPDX = !le(VOPDOp, VOPDX_Max_Index); + bit CanBeVOPD3X = !and(!le(VOPDOp, VOPD3X_Max_Index), + !and(!ne(vOPDName, "v_bitop2_b32"), + !and(!ne(vOPDName, "v_max_i32"), + !ne(vOPDName, "v_min_i32")))); } class VOPAnyCommon pattern> : @@ -627,9 +632,9 @@ def SDWA { class VOP_SDWAe : Enc64 { bits<8> src0; bits<3> src0_sel; - bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<5> src0_modifiers; // float: {abs,neg}, int {sext} bits<3> src1_sel; - bits<2> src1_modifiers; + bits<5> src1_modifiers; bits<3> dst_sel; bits<2> dst_unused; bits<1> clamp; @@ -639,10 +644,10 @@ class VOP_SDWAe : Enc64 { let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0); - let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{4}, 0); let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0); - let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{4}, 0); let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); } @@ -663,18 +668,18 @@ class VOP_SDWAe : Enc64 { class VOP_SDWA9e : Enc64 { bits<9> src0; // {src0_sgpr{0}, src0{7-0}} bits<3> src0_sel; - 
bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<5> src0_modifiers; // float: {abs,neg}, int {sext} bits<3> src1_sel; - bits<2> src1_modifiers; + bits<5> src1_modifiers; bits<1> src1_sgpr; let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0); - let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{4}, 0); let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{55} = !if(P.HasSrc0, src0{8}, 0); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0); - let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{4}, 0); let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); let Inst{63} = 0; // src1_sgpr - should be specified in subclass } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index cbd43cde78548..fb72bab03e750 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -115,7 +115,6 @@ #include using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "arm-isel" @@ -5519,6 +5518,24 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT)); } + + // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns. + // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1)) + // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1)) + // Both require less instructions than compare and conditional select. 
+ if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC && + RHSC->isZero() && CFVal && CFVal->isZero() && + LHS.getValueType() == RHS.getValueType()) { + EVT VT = LHS.getValueType(); + SDValue Shift = + DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getConstant(VT.getSizeInBits() - 1, dl, VT)); + + if (CC == ISD::SETGT) + Shift = DAG.getNOT(dl, Shift, VT); + + return DAG.getNode(ISD::AND, dl, VT, LHS, Shift); + } } if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 9c38901f6821f..b6e8ce7d78b23 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -160,18 +160,18 @@ void DXContainerGlobals::addRootSignature(Module &M, assert(MMI.EntryPropertyVec.size() == 1); - auto &RSA = getAnalysis(); + auto &RSA = getAnalysis().getRSInfo(); const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry; - const auto &FuncRs = RSA.find(EntryFunction); + const std::optional &RS = + RSA.getDescForFunction(EntryFunction); - if (FuncRs == RSA.end()) + if (!RS) return; - const RootSignatureDesc &RS = FuncRs->second; SmallString<256> Data; raw_svector_ostream OS(Data); - RS.write(OS); + RS->write(OS); Constant *Constant = ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false); diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index c8866bfefdfc5..703a9e56626c8 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/AttributeMask.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -239,6 +240,11 @@ class DXILPrepareModule : public ModulePass { for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx) 
F.removeParamAttrs(Idx, AttrMask); + // Lifetime intrinsics in LLVM 3.7 do not have the memory FnAttr + if (Intrinsic::ID IID = F.getIntrinsicID(); + IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end) + F.removeFnAttr(Attribute::Memory); + for (auto &BB : F) { IRBuilder<> Builder(&BB); for (auto &I : make_early_inc_range(BB)) { @@ -247,7 +253,7 @@ class DXILPrepareModule : public ModulePass { // Emtting NoOp bitcast instructions allows the ValueEnumerator to be // unmodified as it reserves instruction IDs during contruction. - if (auto LI = dyn_cast(&I)) { + if (auto *LI = dyn_cast(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, LI->getPointerOperand(), LI->getType())) { @@ -257,7 +263,7 @@ class DXILPrepareModule : public ModulePass { } continue; } - if (auto SI = dyn_cast(&I)) { + if (auto *SI = dyn_cast(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, SI->getPointerOperand(), SI->getValueOperand()->getType())) { @@ -268,7 +274,7 @@ class DXILPrepareModule : public ModulePass { } continue; } - if (auto GEP = dyn_cast(&I)) { + if (auto *GEP = dyn_cast(&I)) { if (Value *NoOpBitcast = maybeGenerateBitcast( Builder, PointerTypes, I, GEP->getPointerOperand(), GEP->getSourceElementType())) @@ -280,6 +286,17 @@ class DXILPrepareModule : public ModulePass { CB->removeRetAttrs(AttrMask); for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) CB->removeParamAttrs(Idx, AttrMask); + // LLVM 3.7 Lifetime intrinics require an i8* pointer operand, so we + // insert a bitcast here to ensure that is the case + if (isa(CB)) { + Value *PtrOperand = CB->getArgOperand(1); + Builder.SetInsertPoint(CB); + PointerType *PtrTy = cast(PtrOperand->getType()); + Value *NoOpBitcast = Builder.Insert( + CastInst::Create(Instruction::BitCast, PtrOperand, + Builder.getPtrTy(PtrTy->getAddressSpace()))); + CB->setArgOperand(1, NoOpBitcast); + } continue; } } diff --git 
a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index cfd4107b8a3de..dfc81626da01f 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -596,9 +596,9 @@ analyzeModule(Module &M) { AnalysisKey RootSignatureAnalysis::Key; -SmallDenseMap +RootSignatureAnalysis::Result RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { - return analyzeModule(M); + return RootSignatureBindingInfo(analyzeModule(M)); } //===----------------------------------------------------------------------===// @@ -606,8 +606,7 @@ RootSignatureAnalysis::run(Module &M, ModuleAnalysisManager &AM) { PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, ModuleAnalysisManager &AM) { - SmallDenseMap &RSDMap = - AM.getResult(M); + RootSignatureBindingInfo &RSDMap = AM.getResult(M); OS << "Root Signature Definitions" << "\n"; @@ -678,13 +677,14 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, //===----------------------------------------------------------------------===// bool RootSignatureAnalysisWrapper::runOnModule(Module &M) { - FuncToRsMap = analyzeModule(M); + FuncToRsMap = std::make_unique( + RootSignatureBindingInfo(analyzeModule(M))); return false; } void RootSignatureAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); + AU.addPreserved(); } char RootSignatureAnalysisWrapper::ID = 0; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index be5cc78bc6bdf..fc39b38258df8 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -10,6 +10,8 @@ /// Root Signatures. 
/// //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_DIRECTX_DXILROOTSIGNATURE_H +#define LLVM_LIB_TARGET_DIRECTX_DXILROOTSIGNATURE_H #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" @@ -34,6 +36,34 @@ enum class RootSignatureElementKind { DescriptorTable = 6, StaticSamplers = 7 }; + +class RootSignatureBindingInfo { +private: + SmallDenseMap FuncToRsMap; + +public: + using iterator = + SmallDenseMap::iterator; + + RootSignatureBindingInfo() = default; + RootSignatureBindingInfo( + SmallDenseMap Map) + : FuncToRsMap(Map) {}; + + iterator find(const Function *F) { return FuncToRsMap.find(F); } + + iterator end() { return FuncToRsMap.end(); } + + std::optional + getDescForFunction(const Function *F) { + const auto FuncRs = find(F); + if (FuncRs == end()) + return std::nullopt; + + return FuncRs->second; + } +}; + class RootSignatureAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; static AnalysisKey Key; @@ -41,10 +71,9 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { public: RootSignatureAnalysis() = default; - using Result = SmallDenseMap; + using Result = RootSignatureBindingInfo; - SmallDenseMap - run(Module &M, ModuleAnalysisManager &AM); + Result run(Module &M, ModuleAnalysisManager &AM); }; /// Wrapper pass for the legacy pass manager. @@ -53,19 +82,13 @@ class RootSignatureAnalysis : public AnalysisInfoMixin { /// passes which run through the legacy pass manager. 
class RootSignatureAnalysisWrapper : public ModulePass { private: - SmallDenseMap FuncToRsMap; + std::unique_ptr FuncToRsMap; public: static char ID; - RootSignatureAnalysisWrapper() : ModulePass(ID) {} - using iterator = - SmallDenseMap::iterator; - - iterator find(const Function *F) { return FuncToRsMap.find(F); } - - iterator end() { return FuncToRsMap.end(); } + RootSignatureBindingInfo &getRSInfo() { return *FuncToRsMap; } bool runOnModule(Module &M) override; @@ -84,3 +107,4 @@ class RootSignatureAnalysisPrinter } // namespace dxil } // namespace llvm +#endif diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index bd3349d2e18c5..eb4adfea5aed6 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -152,7 +152,7 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, if (!CSF.Int64Ops) CSF.Int64Ops = I.getType()->isIntegerTy(64); - if (!CSF.Int64Ops) { + if (!CSF.Int64Ops && !isa(&I)) { for (const Value *Op : I.operands()) { if (Op->getType()->isIntegerTy(64)) { CSF.Int64Ops = true; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 1d79c3018439e..46d5d7177c198 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -2545,6 +2545,25 @@ void DXILBitcodeWriter::writeInstruction(const Instruction &I, unsigned InstID, Vals.clear(); } +// HLSL Change +namespace { +struct ValueNameCreator { + MallocAllocator Allocator; + SmallVector + ValueNames; // SmallVector N = 2 because we currently only expect this + // to hold ValueNames for Lifetime intrinsics + ~ValueNameCreator() { + for (auto *VN : ValueNames) + VN->Destroy(Allocator); + } + ValueName *create(StringRef Name, Value *V) { + ValueName *VN = ValueName::create(Name, Allocator, V); + ValueNames.push_back(VN); + return VN; + } 
+}; +} // anonymous namespace + // Emit names for globals/functions etc. void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( const ValueSymbolTable &VST) { @@ -2559,9 +2578,24 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( // to ensure the binary is the same no matter what values ever existed. SmallVector SortedTable; + // HLSL Change + ValueNameCreator VNC; for (auto &VI : VST) { - SortedTable.push_back(VI.second->getValueName()); + ValueName *VN = VI.second->getValueName(); + // Clang mangles lifetime intrinsic names by appending '.p0' to the end, + // making them invalid lifetime intrinsics in LLVM 3.7. We can't + // demangle in dxil-prepare because it would result in invalid IR. + // Therefore we have to do this in the bitcode writer while writing its + // name to the symbol table. + if (const Function *Fn = dyn_cast(VI.getValue()); + Fn && Fn->isIntrinsic()) { + Intrinsic::ID IID = Fn->getIntrinsicID(); + if (IID == Intrinsic::lifetime_start || IID == Intrinsic::lifetime_end) + VN = VNC.create(Intrinsic::getBaseName(IID), VI.second); + } + SortedTable.push_back(VN); } + // The keys are unique, so there shouldn't be stability issues. llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) { return A->first() < B->first(); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index acd5b58c48785..ec73e58ce5d44 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1762,6 +1762,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); } + setOperationAction(ISD::SADDSAT, MVT::i32, Legal); + setOperationAction(ISD::SADDSAT, MVT::i64, Legal); + // Extending loads from (native) vectors of i8 into (native) vectors of i16 // are legal. 
setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, MVT::v2i8, Legal); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 0e13dd3214da6..f1fa40c1b9036 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -117,6 +117,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget.useHVX128BOps()) + setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) { @@ -204,6 +206,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::CTLZ, T, Legal); setOperationAction(ISD::SELECT, T, Legal); setOperationAction(ISD::SPLAT_VECTOR, T, Legal); + setOperationAction(ISD::UADDSAT, T, Legal); + setOperationAction(ISD::SADDSAT, T, Legal); if (T != ByteV) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); @@ -295,6 +299,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::CTPOP, T, Custom); setOperationAction(ISD::ADD, T, Legal); + setOperationAction(ISD::UADDSAT, T, Legal); + setOperationAction(ISD::SADDSAT, T, Legal); setOperationAction(ISD::SUB, T, Legal); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); @@ -2001,6 +2007,28 @@ HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BUILD_PAIR, dl, ResTy, Combines); } + + // Handle bitcast from i32, v2i16, and v4i8 to v32i1. + // Splat the input into a 32-element i32 vector, then AND each element + // with a unique bitmask to isolate individual bits. 
+ if (ResTy == MVT::v32i1 && + (ValTy == MVT::i32 || ValTy == MVT::v2i16 || ValTy == MVT::v4i8) && + Subtarget.useHVX128BOps()) { + SDValue Val32 = Val; + if (ValTy == MVT::v2i16 || ValTy == MVT::v4i8) + Val32 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Val); + + MVT VecTy = MVT::getVectorVT(MVT::i32, 32); + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Val32); + SmallVector Mask; + for (unsigned i = 0; i < 32; ++i) + Mask.push_back(DAG.getConstant(1ull << i, dl, MVT::i32)); + + SDValue MaskVec = DAG.getBuildVector(VecTy, dl, Mask); + SDValue Anded = DAG.getNode(ISD::AND, dl, VecTy, Splat, MaskVec); + return DAG.getNode(HexagonISD::V2Q, dl, ResTy, Anded); + } + if (isHvxBoolTy(ResTy) && ValTy.isScalarInteger()) { // Handle bitcast from i128 -> v128i1 and i64 -> v64i1. unsigned BitWidth = ValTy.getSizeInBits(); diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 2a991bafbf148..82d999ad820ed 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -582,6 +582,13 @@ def: Pat<(v8i1 (trunc V8I8:$Rs)), (A4_vcmpbeqi (Combinew (A2_andir (HiReg $Rs), (i32 0x01010101)), (A2_andir (LoReg $Rs), (i32 0x01010101))), (i32 1))>; +def : Pat<(v4i1 (trunc V4I8:$Rs)), + (A4_vcmpheqi (Combinew (A2_andir (HiReg (S2_vzxtbh $Rs)), 0x00010001), + (A2_andir (LoReg (S2_vzxtbh $Rs)), 0x00010001)), + (i32 1))>; +def: Pat<(v2i1 (trunc V2I16:$Rs)), + (A4_vcmpweqi (A2_andp (S2_vzxthw $Rs), (A2_combineii (i32 1), (i32 1))), + (i32 1))>; // Saturation: @@ -1517,6 +1524,14 @@ def: Pat<(or I32:$Rs, anyimm:$s10), (A2_orir I32:$Rs, imm:$s10)>; def: Pat<(and I32:$Rs, anyimm:$s10), (A2_andir I32:$Rs, imm:$s10)>; def: Pat<(sub anyimm:$s10, I32:$Rs), (A2_subri imm:$s10, I32:$Rs)>; +class OpR_RR_pat_sat + : Pat<(ResType (Op RxPred:$Rs, RxPred:$Rt)), + (MI RxPred:$Rs, RxPred:$Rt)>; + +def: OpR_RR_pat_sat; +def: OpR_RR_pat_sat; + def: OpR_RR_pat; def: OpR_RR_pat; def: OpR_RR_pat; diff --git 
a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index ba449eaeed34c..fb2ef59d99ef1 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -426,6 +426,21 @@ let Predicates = [UseHVX] in { (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>; } +let Predicates = [UseHVX] in { + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; + def: OpR_RR_pat_sat; +} + // For now, we always deal with vector floating point in SF mode. class OpR_RR_pat_conv diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 72dbb44815657..c47987fbf683b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -291,6 +291,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, VT, Legal); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); @@ -352,7 +353,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SETCC, VT, Legal); setOperationAction(ISD::VSELECT, VT, Legal); @@ -499,6 +501,8 @@ SDValue 
LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); + case ISD::CONCAT_VECTORS: + return lowerCONCAT_VECTORS(Op, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); case ISD::BITREVERSE: @@ -2522,6 +2526,72 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, return SDValue(); } +SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT ResVT = Op.getSimpleValueType(); + assert(ResVT.is256BitVector() && Op.getNumOperands() == 2); + + unsigned NumOperands = Op.getNumOperands(); + unsigned NumFreezeUndef = 0; + unsigned NumZero = 0; + unsigned NumNonZero = 0; + unsigned NonZeros = 0; + SmallSet Undefs; + for (unsigned i = 0; i != NumOperands; ++i) { + SDValue SubVec = Op.getOperand(i); + if (SubVec.isUndef()) + continue; + if (ISD::isFreezeUndef(SubVec.getNode())) { + // If the freeze(undef) has multiple uses then we must fold to zero. + if (SubVec.hasOneUse()) { + ++NumFreezeUndef; + } else { + ++NumZero; + Undefs.insert(SubVec); + } + } else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) + ++NumZero; + else { + assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. + NonZeros |= 1 << i; + ++NumNonZero; + } + } + + // If we have more than 2 non-zeros, build each half separately. + if (NumNonZero > 2) { + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); + ArrayRef Ops = Op->ops(); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, + Ops.slice(0, NumOperands / 2)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, + Ops.slice(NumOperands / 2)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); + } + + // Otherwise, build it up through insert_subvectors. + SDValue Vec = NumZero ? DAG.getConstant(0, DL, ResVT) + : (NumFreezeUndef ? 
DAG.getFreeze(DAG.getUNDEF(ResVT)) + : DAG.getUNDEF(ResVT)); + + // Replace Undef operands with ZeroVector. + for (SDValue U : Undefs) + DAG.ReplaceAllUsesWith(U, DAG.getConstant(0, DL, U.getSimpleValueType())); + + MVT SubVT = Op.getOperand(0).getSimpleValueType(); + unsigned NumSubElems = SubVT.getVectorNumElements(); + for (unsigned i = 0; i != NumOperands; ++i) { + if ((NonZeros & (1 << i)) == 0) + continue; + + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResVT, Vec, Op.getOperand(i), + DAG.getVectorIdxConstant(i * NumSubElems, DL)); + } + + return Vec; +} + SDValue LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 60dc2b385a75c..6b49a98f3ae46 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -376,6 +376,7 @@ class LoongArchTargetLowering : public TargetLowering { SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index ff7b0f2ae3f25..95e9fd49d1c0d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1860,12 +1860,6 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; -// XVPERMI_Q -foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in -def : Pat<(vt (concat_vectors LSX128:$vd, LSX128:$vj)), - 
(XVPERMI_Q (SUBREG_TO_REG (i64 0), LSX128:$vd, sub_128), - (SUBREG_TO_REG (i64 0), LSX128:$vj, sub_128), 2)>; - // XVABSD_{B/H/W/D}[U] defm : PatXrXr; defm : PatXrXrU; @@ -1879,6 +1873,35 @@ def : Pat<(loongarch_xvmskgez (v32i8 LASX256:$vj)), (PseudoXVMSKGEZ_B LASX256:$v def : Pat<(loongarch_xvmskeqz (v32i8 LASX256:$vj)), (PseudoXVMSKEQZ_B LASX256:$vj)>; def : Pat<(loongarch_xvmsknez (v32i8 LASX256:$vj)), (PseudoXVMSKNEZ_B LASX256:$vj)>; +// Subvector tricks +// Patterns for insert_subvector/extract_subvector +multiclass subvector_subreg_lowering { + // A 128-bit subvector extract from the first 256-bit vector position is a + // subregister copy that needs no instruction. Likewise, a 128-bit subvector + // insert to the first 256-bit vector position is a subregister copy that needs + // no instruction. + def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))), + (subVT (EXTRACT_SUBREG RC:$src, subIdx))>; + def : Pat<(VT (insert_subvector undef_or_freeze_undef, subRC:$src, (iPTR 0))), + (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>; + + def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR hiIdx))), + (subVT (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), RC:$src, 1), subIdx))>; + def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR 0))), + (VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 48))>; + def : Pat<(VT (insert_subvector RC:$vd, subRC:$vj, (iPTR hiIdx))), + (VT (XVPERMI_Q RC:$vd, (INSERT_SUBREG (IMPLICIT_DEF), subRC:$vj, subIdx), 2))>; +} + +defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; + } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 429d52fb6f230..ae73d8da79f8e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ 
b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -446,11 +446,18 @@ bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) { bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { SDValue Vector = N->getOperand(0); - // We only care about 16x2 as it's the only real vector type we - // need to deal with. MVT VT = Vector.getSimpleValueType(); - if (!Isv2x16VT(VT)) + if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2)) return false; + + unsigned Opcode; + if (VT.is32BitVector()) + Opcode = NVPTX::I32toV2I16; + else if (VT.is64BitVector()) + Opcode = NVPTX::I64toV2I32; + else + llvm_unreachable("Unhandled packed type"); + // Find and record all uses of this vector that extract element 0 or 1. SmallVector E0, E1; for (auto *U : Vector.getNode()->users()) { @@ -474,11 +481,11 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { if (E0.empty() || E1.empty()) return false; - // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) - // into f16,f16 SplitF16x2(V) + // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1)) + // into EltTy,EltTy Split[EltTy]x2(V) MVT EltVT = VT.getVectorElementType(); SDNode *ScatterOp = - CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector); + CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector); for (auto *Node : E0) ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0)); for (auto *Node : E1) @@ -994,6 +1001,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional Opcode_i8, case MVT::i32: case MVT::f32: return Opcode_i32; + case MVT::v2f32: case MVT::i64: case MVT::f64: return Opcode_i64; @@ -2147,16 +2155,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \ : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix) -#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \ - (is_ch ? 
(CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \ - : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, ))) - -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \ - is_s32) \ - (is_reduce \ - ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \ - : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \ - is_s32))) +#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \ + (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \ + : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, ))) #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \ [&]() -> auto { \ @@ -2169,48 +2170,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \ }() -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \ - (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \ - : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode) - -static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32, - bool IsCacheHint, bool IsIm2Col, - bool IsReduce = false) { +static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, + bool IsShared32, + bool IsCacheHint, + bool IsIm2Col) { if (IsIm2Col) { switch (Dim) { case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint, + IsShared32); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint, + IsShared32); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint, + IsShared32); default: llvm_unreachable("Invalid Dimension in im2col mode for " - 
"GetCpAsyncBulkTensorS2GOpcode."); + "GetCpAsyncBulkTensorS2GReductionOpcode."); } } else { switch (Dim) { case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint, + IsShared32); case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint, + IsShared32); case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint, + IsShared32); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint, + IsShared32); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce, - IsCacheHint, IsShared32); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint, + IsShared32); default: - llvm_unreachable( - "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode."); + llvm_unreachable("Invalid Dimension in tile mode for " + "GetCpAsyncBulkTensorS2GReductionOpcode."); } } } @@ -2257,39 +2255,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, } } -static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint, - bool IsIm2Col) { - if (IsIm2Col) { - switch (Dim) { - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint); - default: - llvm_unreachable("Invalid Dimension in im2col mode for " - "GetCpAsyncBulkTensorPrefetchOpcode."); - } - } else { - switch (Dim) { - case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint); - case 2: - return 
GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint); - case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint); - case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint); - case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint); - default: - llvm_unreachable("Invalid Dimension in tile mode for " - "GetCpAsyncBulkTensorPrefetchOpcode."); - } - } -} - static size_t GetDimsFromIntrinsic(unsigned IID) { switch (IID) { case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: @@ -2354,52 +2319,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } -void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N, - bool IsIm2Col) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {4 + dims} - size_t NumOps = N->getNumOperands(); - size_t NumDims = NumOps - 6; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - size_t NumArgs = NumDims + (IsCacheHint ? 
3 : 2); // src, dst, cache_hint - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumArgs)); - Ops.push_back(N->getOperand(0)); // Chain operand - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = - GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - -void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, - bool IsIm2Col) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // {src, dims{d0...dN}, im2col_offsets{dims-2} - // cache_hint, cache_hint_flag} - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {3 + dims + im2col_offsets} - size_t NumOps = N->getNumOperands(); - size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 5); - // Offsets is always 'NumDims - 2' and only for im2col mode - size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 
2 : 1); - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumArgs)); - Ops.push_back(N->getOperand(0)); // Chain operand - - unsigned Opcode = - GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col) { @@ -2419,8 +2338,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, bool IsShared32 = CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode( - NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true); + unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode( + NumDims, IsShared32, IsCacheHint, IsIm2Col); ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } @@ -2540,18 +2459,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d: - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d: - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d: - SelectCpAsyncBulkTensorS2GCommon(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d: - SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: @@ -2564,18 +2471,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true); return true; - case 
Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d: - SelectCpAsyncBulkTensorPrefetchCommon(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: - case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: - SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b314c4ccefe8b..88e5328ff69c5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); - void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); - void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, bool IsIm2Col = false); void SelectTcgen05Ld(SDNode *N, bool hasOffset = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bb0aeb493ed48..3d010e04824c5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -196,11 +196,6 @@ static bool IsPTXVectorType(MVT VT) { } } -static bool Is16bitsType(MVT VT) { - return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == 
MVT::bf16 || - VT.SimpleTy == MVT::i16); -} - // When legalizing vector loads/stores, this function is called, which does two // things: // 1. Determines Whether the vector is something we want to custom lower, @@ -223,6 +218,9 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { const MVT EltVT = VectorVT.getVectorElementType(); const unsigned NumElts = VectorVT.getVectorNumElements(); + // The size of the PTX virtual register that holds a packed type. + unsigned PackRegSize; + // We only handle "native" vector sizes for now, e.g. <4 x double> is not // legal. We can (and should) split that into 2 stores of <2 x double> here // but I'm leaving that as a TODO for now. @@ -232,7 +230,6 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { case MVT::v4i64: case MVT::v4f64: case MVT::v8i32: - case MVT::v8f32: // This is a "native" vector type iff the address space is global // and the target supports 256-bit loads/stores if (!CanLowerTo256Bit) @@ -241,10 +238,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { case MVT::v2i8: case MVT::v2i32: case MVT::v2i64: - case MVT::v2f32: case MVT::v2f64: case MVT::v4i32: - case MVT::v4f32: // This is a "native" vector type return std::pair(NumElts, EltVT); case MVT::v16f16: // <8 x f16x2> @@ -268,22 +263,26 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { case MVT::v8bf16: // <4 x bf16x2> case MVT::v8i16: // <4 x i16x2> case MVT::v16i8: // <4 x i8x4> - // This can be upsized into a "native" vector type. - // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for - // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use - // vectorized loads/stores with the actual element type for i8/i16 as that - // would require v8/v16 variants that do not exist. - // In order to load/store such vectors efficiently, here in Type - // Legalization, we split the vector into word-sized chunks (v2x16/v4i8). 
- // Later, we will lower to PTX as vectors of b32. + PackRegSize = 32; + break; + case MVT::v8f32: // <4 x f32x2> + if (!CanLowerTo256Bit) + return std::nullopt; + LLVM_FALLTHROUGH; + case MVT::v2f32: // <1 x f32x2> + case MVT::v4f32: // <2 x f32x2> + PackRegSize = 64; + break; + } - // Number of elements to pack in one word. - const unsigned NPerWord = 32 / EltVT.getSizeInBits(); + // If we reach here, then we can pack 2 or more elements into a single 32-bit + // or 64-bit PTX register and treat the vector as a new vector containing + // packed elements. - return std::pair(NumElts / NPerWord, MVT::getVectorVT(EltVT, NPerWord)); - } + // Number of elements to pack in one word. + const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits(); - llvm_unreachable("All cases in switch should return."); + return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg)); } /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive @@ -330,53 +329,49 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, return; } + // Will split structs and arrays into member types, but will not split vector + // types. We do that manually below. ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); - for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { - EVT VT = TempVTs[i]; - uint64_t Off = TempOffsets[i]; - // Split vectors into individual elements, except for v2f16, which - // we will pass as a single scalar. + + for (auto [VT, Off] : zip(TempVTs, TempOffsets)) { + // Split vectors into individual elements that fit into registers. if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); - // We require power-of-2 sized vectors because + // Below we must maintain power-of-2 sized vectors because // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized // vectors. 
- if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 && - isPowerOf2_32(NumElts)) { - // Vectors with an even number of f16 elements will be passed to - // us as an array of v2f16/v2bf16 elements. We must match this so we - // stay in sync with Ins/Outs. - switch (EltVT.getSimpleVT().SimpleTy) { - case MVT::f16: - EltVT = MVT::v2f16; - break; - case MVT::bf16: - EltVT = MVT::v2bf16; - break; - case MVT::i16: - EltVT = MVT::v2i16; - break; - default: - llvm_unreachable("Unexpected type"); - } - NumElts /= 2; - } else if (EltVT.getSimpleVT() == MVT::i8 && - ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) || - NumElts == 3)) { - // v*i8 are formally lowered as v4i8 + + // If the element type belongs to one of the supported packed vector types + // then we can pack multiples of this element into a single register. + if (VT == MVT::v2i8) { + // We can pack 2 i8s into a single 16-bit register. We only do this for + // loads and stores, which is why we have a separate case for it. + EltVT = MVT::v2i8; + NumElts = 1; + } else if (VT == MVT::v3i8) { + // We can also pack 3 i8s into 32-bit register, leaving the 4th + // element undefined. EltVT = MVT::v4i8; - NumElts = (NumElts + 3) / 4; - } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) { - // v2i8 is promoted to v2i16 NumElts = 1; - EltVT = MVT::v2i8; + } else if (NumElts > 1 && isPowerOf2_32(NumElts)) { + // Handle default packed types. 
+ for (MVT PackedVT : NVPTX::packed_types()) { + const auto NumEltsPerReg = PackedVT.getVectorNumElements(); + if (NumElts % NumEltsPerReg == 0 && + EltVT == PackedVT.getVectorElementType()) { + EltVT = PackedVT; + NumElts /= NumEltsPerReg; + break; + } + } } - for (unsigned j = 0; j != NumElts; ++j) { + + for (unsigned J : seq(NumElts)) { ValueVTs.push_back(EltVT); if (Offsets) - Offsets->push_back(Off + j * EltVT.getStoreSize()); + Offsets->push_back(Off + J * EltVT.getStoreSize()); } } else { ValueVTs.push_back(VT); @@ -594,6 +589,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass); addRegisterClass(MVT::bf16, &NVPTX::B16RegClass); addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass); + addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass); // Conversion to/from FP16/FP16x2 is always legal. setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -630,6 +626,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); + // No support for these operations with v2f32. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -655,12 +655,16 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Operations not directly supported by NVPTX. for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, - MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, - MVT::i32, MVT::i64}) { + MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, + MVT::v4i8, MVT::i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } + // Not directly supported. 
TLI would attempt to expand operations like + // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes. + setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); + // Some SIGN_EXTEND_INREG can be done using cvt instruction. // For others we will expand to a SHL/SRA pair. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); @@ -857,6 +861,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); + setOperationAction(Op, MVT::v2f32, + STI.hasF32x2Instructions() ? Legal : Expand); } // On SM80, we select add/mul/sub as fma to avoid promotion to float @@ -878,6 +884,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); // (would be) Library functions. // These map to conversion instructions for scalar FP types. @@ -888,6 +895,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::f64, Legal); setOperationAction(Op, MVT::v2f16, Expand); setOperationAction(Op, MVT::v2bf16, Expand); + setOperationAction(Op, MVT::v2f32, Expand); setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); @@ -903,6 +911,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } } + // Expand v2f32 = fp_extend + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + // Expand v2[b]f16 = fp_round v2f32 + setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand); + // sm_80 only has conversions between f32 and bf16. Custom lower all other // bf16 conversions. 
if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { @@ -940,14 +953,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); - setOperationAction(Op, MVT::v2f16, Expand); - setOperationAction(Op, MVT::v2bf16, Expand); + setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); } setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal); + setOperationAction(ISD::FABS, MVT::v2f32, Expand); if (STI.getPTXVersion() >= 65) { setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote); setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand); @@ -969,6 +982,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); + setOperationAction(Op, MVT::v2f32, Expand); } bool SupportsF32MinMaxNaN = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; @@ -978,6 +992,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); setBF16OperationAction(Op, MVT::bf16, Legal, Expand); setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); + setOperationAction(Op, MVT::v2f32, Expand); } // Custom lowering for inline asm with 128-bit operands @@ -990,6 +1005,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // - bf16/bf16x2 (sm_90+, PTX 7.8+) // When f16/bf16 types aren't supported, they are promoted/expanded to f32. 
setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote); setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand); setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote); @@ -1001,7 +1017,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::FLOG2, MVT::f32, Legal); setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32); setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32); - setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand); + setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, + Expand); } setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); @@ -2074,7 +2091,7 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op->getValueType(0); - if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) + if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())) return Op; SDLoc DL(Op); @@ -2124,15 +2141,26 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, Value = Value.trunc(8); return Value.zext(32); }; - APInt Value; - if (Isv2x16VT(VT)) { - Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); - } else if (VT == MVT::v4i8) { - Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | - GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); - } else { - llvm_unreachable("Unsupported type"); - } + + // Construct a 32-bit constant by shifting into place smaller values + // (elements of the vector type VT). 
+ // For example, if VT has 2 elements, then N == 2: + // ShiftAmount = 32 / N = 16 + // Value |= Op0 (b16) << 0 + // Value |= Op1 (b16) << 16 + // If N == 4: + // ShiftAmount = 32 / N = 8 + // Value |= Op0 (b8) << 0 + // Value |= Op1 (b8) << 8 + // Value |= Op2 (b8) << 16 + // Value |= Op3 (b8) << 24 + // ...etc + APInt Value(32, 0); + const unsigned NumElements = VT.getVectorNumElements(); + assert(32 % NumElements == 0 && "must evenly divide bit length"); + const unsigned ShiftAmount = 32 / NumElements; + for (unsigned ElementNo : seq(NumElements)) + Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount); SDValue Const = DAG.getConstant(Value, DL, MVT::i32); return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const); } @@ -2160,7 +2188,8 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return Op; // Extract individual elements and select one of them. - assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); + assert(NVPTX::isPackedVectorTy(VectorVT) && + VectorVT.getVectorNumElements() == 2 && "Unexpected vector type."); EVT EltVT = VectorVT.getVectorElementType(); SDLoc dl(Op.getNode()); @@ -3069,14 +3098,19 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } +static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl &Results, + const NVPTXSubtarget &STI); + SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) return LowerLOADi1(Op, DAG); - // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle - // unaligned loads and have to handle it here. EVT VT = Op.getValueType(); - if (Isv2x16VT(VT) || VT == MVT::v4i8) { + + if (NVPTX::isPackedVectorTy(VT)) { + // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to + // handle unaligned loads and have to handle it here. 
LoadSDNode *Load = cast(Op); EVT MemVT = Load->getMemoryVT(); if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), @@ -3120,17 +3154,19 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) return LowerSTOREi1(Op, DAG); - // v2f16 is legal, so we can't rely on legalizer to handle unaligned - // stores and have to handle it here. - if ((Isv2x16VT(VT) || VT == MVT::v4i8) && + // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to + // handle unaligned stores and have to handle it here. + if (NVPTX::isPackedVectorTy(VT) && !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); - // v2f16, v2bf16 and v2i16 don't need special handling. - if (Isv2x16VT(VT) || VT == MVT::v4i8) + // v2f16/v2bf16/v2i16 don't need special handling. + if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()) return SDValue(); + // Lower store of any other vector type, including v2f32 as we want to break + // it apart since this is not a widely-supported type. return LowerSTOREVector(Op, DAG); } @@ -4920,7 +4956,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } -/// Fold extractelts into a load by increasing the number of return values. +/// Fold unpacking movs into a load by increasing the number of return values. /// /// ex: /// L: v2f16,ch = load

@@ -4929,6 +4965,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, /// use(a, b) /// /// ...is turned into... +/// /// L: f16,f16,ch = LoadV2

/// use(L:0, L:1) static SDValue @@ -4937,10 +4974,13 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (!DCI.isAfterLegalizeDAG()) return SDValue(); - EVT ElemVT = N->getValueType(0); - if (!Isv2x16VT(ElemVT)) + EVT ElementVT = N->getValueType(0); + // Avoid non-packed types and v4i8 + if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8) return SDValue(); + SmallVector DeadCopyToRegs; + // Check whether all outputs are either used by an extractelt or are // glue/chain nodes if (!all_of(N->uses(), [&](SDUse &U) { @@ -4968,6 +5008,12 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return !U.getUser()->use_empty(); } + // Handle CopyToReg nodes that will become dead after our replacement + if (U.getUser()->getOpcode() == ISD::CopyToReg) { + DeadCopyToRegs.push_back(U.getUser()); + return true; + } + // Otherwise, this use prevents us from splitting a value. return false; })) @@ -5000,6 +5046,13 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Opcode = NVPTXISD::LoadV4; break; case NVPTXISD::LoadV4: + // V8 is only supported for f32. Don't forget, we're not changing the load + // size here. This is already a 256-bit load. 
+ if (ElementVT != MVT::v2f32) + return SDValue(); + OldNumOutputs = 4; + Opcode = NVPTXISD::LoadV8; + break; case NVPTXISD::LoadV8: // PTX doesn't support the next doubling of outputs return SDValue(); @@ -5007,7 +5060,7 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // the non-glue, non-chain outputs in the new load const unsigned NewNumOutputs = OldNumOutputs * 2; - SmallVector NewVTs(NewNumOutputs, ElemVT.getVectorElementType()); + SmallVector NewVTs(NewNumOutputs, ElementVT.getVectorElementType()); // add remaining chain and glue values NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end()); @@ -5022,23 +5075,28 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SmallVector Results; for (unsigned I : seq(OldNumOutputs)) Results.push_back(DCI.DAG.getBuildVector( - ElemVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)})); + ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)})); // Add remaining chain and glue nodes for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs)) Results.push_back(NewLoad.getValue(NewNumOutputs + I)); + // Remove dead CopyToReg nodes by folding them into the chain they reference + for (SDNode *CTR : DeadCopyToRegs) + DCI.CombineTo(CTR, CTR->getOperand(0)); + return DCI.DAG.getMergeValues(Results, DL); } -/// Fold a packing mov into a store. +/// Fold packing movs into a store. /// /// ex: -/// v: v2f16 = BUILD_VECTOR a:f16, b:f16 -/// StoreRetval v +/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16 +/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16 +/// StoreV2 v1, v2 /// /// ...is turned into... /// -/// StoreRetvalV2 a:f16, b:f16 +/// StoreV4 a, b, c, d static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back) { @@ -5050,7 +5108,8 @@ static SDValue combinePackingMovIntoStore(SDNode *N, // Get the type of the operands being stored. 
EVT ElementVT = N->getOperand(Front).getValueType(); - if (!Isv2x16VT(ElementVT)) + // Avoid non-packed types and v4i8 + if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8) return SDValue(); auto *ST = cast(N); @@ -5077,6 +5136,12 @@ static SDValue combinePackingMovIntoStore(SDNode *N, Opcode = NVPTXISD::StoreV4; break; case NVPTXISD::StoreV4: + // V8 is only supported for f32. Don't forget, we're not changing the store + // size here. This is already a 256-bit store. + if (ElementVT != MVT::v2f32) + return SDValue(); + Opcode = NVPTXISD::StoreV8; + break; case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands @@ -5606,10 +5671,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N, IsPTXVectorType(VectorVT.getSimpleVT())) return SDValue(); // Native vector loads already combine nicely w/ // extract_vector_elt. - // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already - // handle them OK. - if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) || - VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8) + // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8), + // we already handle them OK. + if (VectorVT.getVectorNumElements() == 1 || + NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8) return SDValue(); // Don't mess with undef values as sra may be simplified to 0, not undef. 
@@ -5682,7 +5747,10 @@ static SDValue PerformVSELECTCombine(SDNode *N, static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { auto VT = N->getValueType(0); - if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT)) + if (!DCI.isAfterLegalizeDAG() || + // only process v2*16 types + !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() && + VT.getVectorNumElements() == 2)) return SDValue(); auto Op0 = N->getOperand(0); @@ -5822,7 +5890,7 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, } /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. -static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, +static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results, const NVPTXSubtarget &STI) { LoadSDNode *LD = cast(N); @@ -6146,7 +6214,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( ReplaceBITCAST(N, DAG, Results); return; case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results, STI); + replaceLoadVector(N, DAG, Results, STI); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dcdebb81e3c86..db6b411509e93 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX: Predicate<"Subtarget->getPTXVersion() >= " # version>; class hasSM: Predicate<"Subtarget->getSmVersion() >= " # version>; @@ -199,6 +200,7 @@ def BF16RT : RegTyInfo; def F16X2RT : RegTyInfo; def BF16X2RT : RegTyInfo; +def F32X2RT : RegTyInfo; // This class provides a basic wrapper around an 
NVPTXInst that abstracts the @@ -395,6 +397,13 @@ multiclass F3 { op_str # "$ftz.f16", [(set f16:$dst, (op_pat f16:$a, f16:$b))]>, Requires<[useFP16Math]>; + def f32x2rr : + BasicFlagsNVPTXInst<(outs B64:$dst), + (ins B64:$a, B64:$b), + (ins FTZFlag:$ftz), + op_str # "$ftz.f32x2", + [(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>, + Requires<[hasF32x2Instructions]>; def f16x2rr : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), @@ -747,6 +756,9 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b32rr $a, $b, $p)>; } +def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)), + (SELP_b64rr $a, $b, $p)>; + //----------------------------------- // Test Instructions //----------------------------------- @@ -1218,6 +1230,7 @@ defm FMA_F16x2 : FMA; defm FMA_BF16 : FMA; defm FMA_BF16x2 : FMA; defm FMA_F32 : FMA; +defm FMA_F32x2 : FMA; defm FMA_F64 : FMA; // sin/cos @@ -2302,6 +2315,7 @@ def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H $s)>; def: Pat<(i32 (sext (extractelt v2i16:$src, 0))), (CVT_INREG_s32_s16 $src)>; +// Handle extracting one element from the pair (32-bit types) foreach vt = [v2f16, v2bf16, v2i16] in { def : Pat<(extractelt vt:$src, 0), (I32toI16L_Sink $src)>, Requires<[hasPTX<71>]>; def : Pat<(extractelt vt:$src, 1), (I32toI16H_Sink $src)>, Requires<[hasPTX<71>]>; @@ -2313,10 +2327,21 @@ foreach vt = [v2f16, v2bf16, v2i16] in { (V2I16toI32 $a, $b)>; } +// Same thing for the 64-bit type v2f32. 
+foreach vt = [v2f32] in { + def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>; + def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>; + + def : Pat<(extractelt vt:$src, 0), (I64toI32L $src)>; + def : Pat<(extractelt vt:$src, 1), (I64toI32H $src)>; + + def : Pat<(vt (build_vector vt.ElementType:$a, vt.ElementType:$b)), + (V2I32toI64 $a, $b)>; +} + def: Pat<(v2i16 (scalar_to_vector i16:$a)), (CVT_u32_u16 $a, CvtNONE)>; - def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>; def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)), @@ -2467,7 +2492,7 @@ def : Pat<(brcond i32:$a, bb:$target), // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a // conditional branch if the target block is the next block so that the code -// can fall through to the target block. The invertion is done by 'xor +// can fall through to the target block. The inversion is done by 'xor // condition, 1', which will be translated to (setne condition, -1). Since ptx // supports '@!pred bra target', we should use it. 
def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target), @@ -2707,4 +2732,4 @@ let Predicates = [useFP16Math, hasPTX<70>, hasSM<80>] in { let Predicates = [hasBF16Math, hasPTX<70>, hasSM<80>] in { def FMARELU_BF16 : FMARELUInst; def FMARELU_BF16X2 : FMARELUInst; -} \ No newline at end of file +} diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d840324ce8238..93827be5c2811 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -560,6 +560,30 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR; // TMA Async Bulk Tensor Copy Functions //------------------------------------- +class TMA_DIMS_UTIL { + // For example, when 'dim' is 3, this generates: + // an ins_dag: B32:$d0, B32:$d1, B32:$d2 + // with base_str: $d0, $d1, $d2 + dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); + string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); +} + +class TMA_IM2COL_UTIL { + // For im2col_w/w_128 modes, number of offsets is always 2. + // For im2col mode, offsets is (dim - 2). + // For non-im2col modes (i.e. tile) there are no offsets. 
+ int offsets = !cond( + !eq(mode, "im2col") : !sub(dim, 2), + !eq(mode, "im2col_w") : 2, + !eq(mode, "im2col_w_128") : 2, + true : 0); // for all other modes + + dag ins_dag = !if(!gt(offsets, 0), + !dag(ins, !listsplat(B16, offsets), !foreach(i, !range(offsets), "im2col" # i)), + (ins)); + string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", "); +} + // From Global to Shared memory (G2S) class G2S_STRINGS { string prefix = "cp.async.bulk.tensor"; @@ -583,8 +607,8 @@ def CTAGroupFlags : Operand { } multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR { - defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); - defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + defvar dims_dag = TMA_DIMS_UTIL.ins_dag; + defvar dims_str = TMA_DIMS_UTIL.base_str; defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; defvar rc = !if(is_shared32, B32, B64); @@ -628,39 +652,46 @@ foreach dim = [1, 2, 3, 4, 5] in { } } -// From Shared to Global memory (S2G) -class S2G_STRINGS { - string dir = "global.shared::cta"; - string completion = "bulk_group"; - string inst_name = !if(is_reduce, "cp.reduce", "cp") - # ".async.bulk.tensor" - # "." # dim # "d" - # "." # dir - # "." # mode - # "." 
# completion - # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_" - # !if(is_reduce, "RED_", "S2G_") - # dim # "D" - # !if(is_shared32, "_SHARED32", "") - # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); -} - -multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR { - defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); - defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); +multiclass TMA_TENSOR_S2G_INTR pred = [hasPTX<80>, hasSM<90>]> { + defvar dims_dag = TMA_DIMS_UTIL.ins_dag; + defvar dims_str = TMA_DIMS_UTIL.base_str; defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; - defvar rc = !if(shared32, B32, B64); + + defvar intr = !cast( + "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d); + defvar intr_dag = !con((intr addr:$src, B64:$tmap), + !setdagop(dims_dag, intr), + (intr B64:$ch, 0)); + defvar intr_dag_with_ch = !con((intr addr:$src, B64:$tmap), + !setdagop(dims_dag, intr), + (intr B64:$ch, -1)); + + // For im2col mode, the actual asm_str is "im2col_no_offs" + defvar mode_asm_str = !if(!eq(mode, "im2col"), + "im2col_no_offs", mode); + defvar prefix = "cp.async.bulk.tensor" + # "." # dim # "d" + # ".global.shared::cta" + # "." 
# mode_asm_str + # ".bulk_group"; def "" : NVPTXInst<(outs), - !con((ins rc:$src, B64:$tmap), dims_dag), - !strconcat(S2G_STRINGS.inst_name, asm_str, ";"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; + !con((ins ADDR:$src, B64:$tmap), dims_dag, (ins B64:$ch)), + prefix # asm_str # ";", + [intr_dag]>, + Requires; def _CH : NVPTXInst<(outs), - !con((ins rc:$src, B64:$tmap), dims_dag, (ins B64:$ch)), - !strconcat(S2G_STRINGS.inst_name, asm_str, ", $ch;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; + !con((ins ADDR:$src, B64:$tmap), dims_dag, (ins B64:$ch)), + prefix # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires; +} +foreach dim = 1...5 in { + foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { + defvar suffix = !toupper(mode) # "_" # dim # D; + defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR; + } } def TMAReductionFlags : Operand { @@ -669,13 +700,16 @@ def TMAReductionFlags : Operand { // TMA Copy from Shared to Global memory with Reduction multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR { - defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); - defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + defvar dims_dag = TMA_DIMS_UTIL.ins_dag; + defvar dims_str = TMA_DIMS_UTIL.base_str; defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; defvar rc = !if(shared32, B32, B64); + // For im2col mode, the actual asm_str is "im2col_no_offs" + defvar mode_asm_str = !if(!eq(mode, "im2col"), + "im2col_no_offs", mode); defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta"; - defvar suffix = "." # mode # ".bulk_group"; + defvar suffix = "." 
# mode_asm_str # ".bulk_group"; def "" : NVPTXInst<(outs), !con((ins rc:$src, B64:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)), @@ -689,58 +723,63 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR foreach dim = [1, 2, 3, 4, 5] in { foreach shared32 = [true, false] in { - foreach mode = !if(!ge(dim, 3), ["tile", "im2col_no_offs"], ["tile"]) in { - defm S2G_STRINGS.intr_name : - CP_ASYNC_BULK_TENSOR_S2G_INTR; - defm S2G_STRINGS.intr_name : + foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { + defvar suffix = dim # "D" + # !if(shared32, "_SHARED32", "") + # "_" # !toupper(mode); + defm CP_ASYNC_BULK_TENSOR_RED_ # suffix : CP_ASYNC_BULK_TENSOR_REDUCE_INTR; } } } // TMA Prefetch from Global memory to L2 cache -class PREFETCH_STRINGS { - string prefix = "cp.async.bulk.prefetch.tensor"; - string dir = "L2.global"; - string inst_name = prefix +multiclass TMA_TENSOR_PREFETCH_INTR pred = [hasPTX<80>, hasSM<90>]> { + defvar dims_dag = TMA_DIMS_UTIL.ins_dag; + defvar dims_str = TMA_DIMS_UTIL.base_str; + defvar asm_str_base = " [$tmap, {{" # dims_str # "}}]"; + + defvar im2col_dag = TMA_IM2COL_UTIL.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar inst_name = "cp.async.bulk.prefetch.tensor" # "." # dim # "d" - # "." # dir - # "." 
# mode - # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_" - # dim # "D" - # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); -} - -multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR { - defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); - defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); - defvar asm_str_default = " [$tmap, {{" # dims_str # "}}]"; - - defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); - defvar im2col_dag = !if(!eq(mode, "im2col"), - !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)), - (ins)); - defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", "); - defvar im2col_asm_str = ", {{" # im2col_str # "}}"; - - defvar asm_str = !if(!eq(mode, "im2col"), - !strconcat(asm_str_default, im2col_asm_str), asm_str_default); - - def "" : NVPTXInst<(outs), - !con((ins B64:$tmap), dims_dag, im2col_dag), - !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ";"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; - def _CH : NVPTXInst<(outs), - !con((ins B64:$tmap), dims_dag, im2col_dag, (ins B64:$ch)), - !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ", $ch;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; -} - -foreach dim = [1, 2, 3, 4, 5] in { + # "." # "L2.global" + # "." 
# mode; + + defvar intr = !cast( + "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d); + + defvar ins_dag = !con((ins B64:$tmap), + dims_dag, + im2col_dag, + (ins B64:$ch)); + defvar intr_dag = !con((intr B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, 0)); + defvar intr_dag_with_ch = !con((intr B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, -1)); + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag]>, + Requires; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires; +} +foreach dim = 1...5 in { foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - defm PREFETCH_STRINGS.intr_name : - CP_ASYNC_BULK_TENSOR_PREFETCH_INTR; + defvar suffix = !toupper(mode) # "_" # dim # D; + defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index 9fac97d97c609..d40886a56d6a4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -53,14 +53,17 @@ foreach i = 0...31 in { } //===----------------------------------------------------------------------===// -// Register classes +// Register classes. +// NOTE: if you add new vector types for a register, you must update +// NVPTX::packed_types() in NVPTXUtilities.h accordingly! 
//===----------------------------------------------------------------------===// def B1 : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>; def B16 : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>; def B32 : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; -def B64 : NVPTXRegClass<[i64, f64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; +def B64 : NVPTXRegClass<[i64, v2f32, f64], 64, (add (sequence "RL%u", 0, 4), + VRFrame64, VRFrameLocal64)>; // 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only. def B128 : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 8810feaee297a..81af55edccadb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -116,6 +116,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return HasTcgen05 && PTXVersion >= 86; } + // f32x2 instructions in Blackwell family + bool hasF32x2Instructions() const { + return SmVersion >= 100 && PTXVersion >= 86; + } // TMA G2S copy with cta_group::1/2 support bool hasCpAsyncBulkTensorCTAGroupSupport() const { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index aa7850acbd64a..9a6e261c811a0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -129,8 +129,9 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { Insert = false; } } - if (Insert && Isv2x16VT(VT)) { - // Can be built in a single mov + if (Insert && NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()) { + // Can be built in a single 32-bit mov (64-bit regs are emulated in SASS + // with 2x 32-bit regs) Cost += 1; Insert = false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h 
b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index e792e441e49e6..88d3eefcc521e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -85,16 +85,32 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); -inline bool Isv2x16VT(EVT VT) { - return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16); -} - inline bool shouldPassAsArray(Type *Ty) { return Ty->isAggregateType() || Ty->isVectorTy() || Ty->getScalarSizeInBits() == 128 || Ty->isHalfTy() || Ty->isBFloatTy(); } namespace NVPTX { +// Returns a list of vector types that we prefer to fit into a single PTX +// register. NOTE: This must be kept in sync with the register classes +// defined in NVPTXRegisterInfo.td. +inline auto packed_types() { + static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16, + MVT::v2i16, MVT::v2f32}; + return PackedTypes; +} + +// Checks if the type VT can fit into a single register. +inline bool isPackedVectorTy(EVT VT) { + return any_of(packed_types(), [VT](EVT OVT) { return OVT == VT; }); +} + +// Checks if two or more of the type ET can fit into a single register. 
+inline bool isPackedElementTy(EVT ET) { + return any_of(packed_types(), + [ET](EVT OVT) { return OVT.getVectorElementType() == ET; }); +} + inline std::string getValidPTXIdentifier(StringRef Name) { std::string ValidName; ValidName.reserve(Name.size() + 4); diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index e32d6eab3b977..47329b2c2f4d2 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_target(RISCVCodeGen RISCVInsertVSETVLI.cpp RISCVInsertWriteVXRM.cpp RISCVInstrInfo.cpp + RISCVInterleavedAccess.cpp RISCVISelDAGToDAG.cpp RISCVISelLowering.cpp RISCVLandingPadSetup.cpp diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7e83abe013063..6f31e889a2555 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -438,7 +438,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (!Subtarget.useCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov() && - !Subtarget.hasVendorXqcicm()) + !Subtarget.hasVendorXqcicm() && !Subtarget.hasVendorXqcics()) setOperationAction(ISD::SELECT, XLenVT, Custom); if (Subtarget.hasVendorXqcia() && !Subtarget.is64Bit()) { @@ -5098,12 +5098,13 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlide1(const SDLoc &DL, MVT VT, return convertFromScalableVector(VT, Vec, DAG, Subtarget); } -// Match a mask which "spreads" the leading elements of a vector evenly -// across the result. Factor is the spread amount, and Index is the -// offset applied. (on success, Index < Factor) This is the inverse -// of a deinterleave with the same Factor and Index. This is analogous -// to an interleave, except that all but one lane is undef. -static bool isSpreadMask(ArrayRef Mask, unsigned Factor, unsigned &Index) { +/// Match a mask which "spreads" the leading elements of a vector evenly +/// across the result. 
Factor is the spread amount, and Index is the +/// offset applied. (on success, Index < Factor) This is the inverse +/// of a deinterleave with the same Factor and Index. This is analogous +/// to an interleave, except that all but one lane is undef. +bool RISCVTargetLowering::isSpreadMask(ArrayRef Mask, unsigned Factor, + unsigned &Index) { SmallVector LaneIsUndef(Factor, true); for (unsigned i = 0; i < Mask.size(); i++) LaneIsUndef[i % Factor] &= (Mask[i] == -1); @@ -6082,7 +6083,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, assert(MaxFactor == 2 || MaxFactor == 4 || MaxFactor == 8); for (unsigned Factor = 4; Factor <= MaxFactor; Factor <<= 1) { unsigned Index; - if (isSpreadMask(Mask, Factor, Index)) { + if (RISCVTargetLowering::isSpreadMask(Mask, Factor, Index)) { MVT NarrowVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts / Factor); SDValue Src = DAG.getExtractSubvector(DL, NarrowVT, V1, 0); @@ -15993,6 +15994,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, return SDValue(); uint64_t MulAmt = CNode->getZExtValue(); + // Don't do this if the Xqciac extension is enabled and the MulAmt in simm12. + if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt)) + return SDValue(); + const bool HasShlAdd = Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa() || Subtarget.hasVendorXAndesPerf(); @@ -23752,6 +23757,10 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, auto *ConstNode = cast(C); const APInt &Imm = ConstNode->getAPIntValue(); + // Don't do this if the Xqciac extension is enabled and the Imm in simm12. + if (Subtarget.hasVendorXqciac() && Imm.isSignedIntN(12)) + return false; + // Break the MUL to a SLLI and an ADD/SUB. 
if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() || (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2()) @@ -24080,39 +24089,6 @@ Value *RISCVTargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { return TargetLowering::getIRStackGuard(IRB); } -bool RISCVTargetLowering::isLegalInterleavedAccessType( - VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, - const DataLayout &DL) const { - EVT VT = getValueType(DL, VTy); - // Don't lower vlseg/vsseg for vector types that can't be split. - if (!isTypeLegal(VT)) - return false; - - if (!isLegalElementTypeForRVV(VT.getScalarType()) || - !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, - Alignment)) - return false; - - MVT ContainerVT = VT.getSimpleVT(); - - if (auto *FVTy = dyn_cast(VTy)) { - if (!Subtarget.useRVVForFixedLengthVectors()) - return false; - // Sometimes the interleaved access pass picks up splats as interleaves of - // one element. Don't lower these. - if (FVTy->getNumElements() < 2) - return false; - - ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT()); - } - - // Need to make sure that EMUL * NFIELDS ≤ 8 - auto [LMUL, Fractional] = RISCVVType::decodeVLMUL(getLMUL(ContainerVT)); - if (Fractional) - return true; - return Factor * LMUL <= 8; -} - bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, Align Alignment) const { if (!Subtarget.hasVInstructions()) @@ -24133,545 +24109,6 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return true; } -static const Intrinsic::ID FixedVlsegIntrIds[] = { - Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask, - Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask, - Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, - Intrinsic::riscv_seg8_load_mask}; - -static const Intrinsic::ID ScalableVlsegIntrIds[] = { - Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, - Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, - 
Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, - Intrinsic::riscv_vlseg8_mask}; - -/// Lower an interleaved load into a vlsegN intrinsic. -/// -/// E.g. Lower an interleaved load (Factor = 2): -/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr -/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements -/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements -/// -/// Into: -/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.seg2.load.v4i32.p0.i64( -/// %ptr, i64 4) -/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 -/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 -bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { - assert(Indices.size() == Shuffles.size()); - - IRBuilder<> Builder(LI); - - const DataLayout &DL = LI->getDataLayout(); - - auto *VTy = cast(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) - return false; - - auto *PtrTy = LI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - - // If the segment load is going to be performed segment at a time anyways - // and there's only one element used, use a strided load instead. This - // will be equally fast, and create less vector register pressure. 
- if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) { - unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - Value *VL = Builder.getInt32(VTy->getNumElements()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, - {VTy, BasePtr->getType(), Stride->getType()}, - {BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); - Shuffles[0]->replaceAllUsesWith(CI); - return true; - }; - - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); - - for (unsigned i = 0; i < Shuffles.size(); i++) { - Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); - Shuffles[i]->replaceAllUsesWith(SubVec); - } - - return true; -} - -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - -/// Lower an interleaved store into a vssegN intrinsic. -/// -/// E.g. 
Lower an interleaved store (Factor = 3): -/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, -/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> -/// store <12 x i32> %i.vec, <12 x i32>* %ptr -/// -/// Into: -/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> -/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> -/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> -/// call void llvm.riscv.seg3.store.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2, -/// %ptr, i32 4) -/// -/// Note that the new shufflevectors will be removed and we'll only generate one -/// vsseg3 instruction in CodeGen. -bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, - ShuffleVectorInst *SVI, - unsigned Factor) const { - IRBuilder<> Builder(SI); - const DataLayout &DL = SI->getDataLayout(); - auto Mask = SVI->getShuffleMask(); - auto *ShuffleVTy = cast(SVI->getType()); - // Given SVI : , then VTy : - auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), - ShuffleVTy->getNumElements() / Factor); - if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) - return false; - - auto *PtrTy = SI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - - unsigned Index; - // If the segment store only has one active lane (i.e. the interleave is - // just a spread shuffle), we can use a strided store instead. This will - // be equally fast, and create less vector register pressure. 
- if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && - isSpreadMask(Mask, Factor, Index)) { - unsigned ScalarSizeInBytes = - DL.getTypeStoreSize(ShuffleVTy->getElementType()); - Value *Data = SVI->getOperand(0); - auto *DataVTy = cast(Data->getType()); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); - Value *VL = Builder.getInt32(VTy->getNumElements()); - - CallInst *CI = Builder.CreateIntrinsic( - Intrinsic::experimental_vp_strided_store, - {Data->getType(), BasePtr->getType(), Stride->getType()}, - {Data, BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); - - return true; - } - - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); - - SmallVector Ops; - SmallVector NewShuffleMask; - - for (unsigned i = 0; i < Factor; i++) { - // Collect shuffle mask for this lane. 
- for (unsigned j = 0; j < VTy->getNumElements(); j++) - NewShuffleMask.push_back(Mask[i + Factor * j]); - - Value *Shuffle = Builder.CreateShuffleVector( - SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask); - Ops.push_back(Shuffle); - - NewShuffleMask.clear(); - } - // This VL should be OK (should be executable in one vsseg instruction, - // potentially under larger LMULs) because we checked that the fixed vector - // type fits in isLegalInterleavedAccessType - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount()); - Ops.append({SI->getPointerOperand(), StoreMask, VL}); - - Builder.CreateCall(VssegNFunc, Ops); - - return true; -} - -bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleaveValues) const { - const unsigned Factor = DeinterleaveValues.size(); - if (Factor > 8) - return false; - - assert(LI->isSimple()); - IRBuilder<> Builder(LI); - - Value *FirstActive = - *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); - VectorType *ResVTy = cast(FirstActive->getType()); - - const DataLayout &DL = LI->getDataLayout(); - - if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) - return false; - - Value *Return; - Type *PtrTy = LI->getPointerOperandType(); - Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - - if (auto *FVTy = dyn_cast(ResVTy)) { - Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {ResVTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); - } else { - static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, - Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, - Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; - - unsigned SEW 
= DL.getTypeSizeInBits(ResVTy->getElementType()); - unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - LI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(LI->getContext()), - NumElts * SEW / 8), - Factor); - - Value *VL = Constant::getAllOnesValue(XLenTy); - - Value *Vlseg = Builder.CreateIntrinsic( - IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}, - {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); - - SmallVector AggrTypes{Factor, ResVTy}; - Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes)); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = Builder.CreateIntrinsic( - Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}, - {Vlseg, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } - - for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { - if (!DIV) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast(Idx)}); - DIV->replaceAllUsesWith(NewEV); - } - - return true; -} - -bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleaveValues) const { - unsigned Factor = InterleaveValues.size(); - if (Factor > 8) - return false; - - assert(SI->isSimple()); - IRBuilder<> Builder(SI); - - auto *InVTy = cast(InterleaveValues[0]->getType()); - auto *PtrTy = SI->getPointerOperandType(); - const DataLayout &DL = SI->getDataLayout(); - - if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) - return false; - - Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - - if (auto *FVTy = dyn_cast(InVTy)) { - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); - - SmallVector Ops(InterleaveValues); - Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); - Ops.append({SI->getPointerOperand(), Mask, VL}); - - Builder.CreateCall(VssegNFunc, Ops); - } else { - static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8}; - - unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); - unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - SI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}); - - Value *VL = Constant::getAllOnesValue(XLenTy); - - Value *StoredVal = PoisonValue::get(VecTupTy); - for (unsigned i = 0; i < Factor; ++i) - StoredVal = 
Builder.CreateIntrinsic( - Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, - {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); - - Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); - } - - return true; -} - -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_c_Mul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) - return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); - } - - return false; -} - -/// Lower an interleaved vp.load into a vlsegN intrinsic. -/// -/// E.g. Lower an interleaved vp.load (Factor = 2): -/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// %dl = tail call { , } -/// @llvm.vector.deinterleave2.nxv64i8( -/// %l) -/// %r0 = extractvalue { , } %dl, 0 -/// %r1 = extractvalue { , } %dl, 1 -/// -/// Into: -/// %rvl = udiv %wide.rvl, 2 -/// %sl = call { , } -/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, -/// undef, -/// ptr %ptr, -/// %mask, -/// i64 %rvl, -/// i64 1) -/// %r0 = extractvalue { , } %sl, 0 -/// %r1 = extractvalue { , } %sl, 1 -/// -/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be -/// removed by the caller -/// TODO: We probably can loosen the dependency on matching extractvalue when -/// dealing with factor of 2 (extractvalue is still required for most of other -/// factors though). 
-bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveResults) const { - const unsigned Factor = DeinterleaveResults.size(); - assert(Mask && "Expect a valid mask"); - assert(Load->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - - Value *FirstActive = *llvm::find_if(DeinterleaveResults, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast(FirstActive->getType()); - - auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Load); - - Value *WideEVL = Load->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. - if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Load->getArgOperand(0)->getType(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); - - Value *Return = nullptr; - if (auto *FVTy = dyn_cast(VTy)) { - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {FVTy, PtrTy, XLenTy}, - {Load->getArgOperand(0), Mask, EVL}); - } else { - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); - - Value *PoisonVal = PoisonValue::get(VecTupTy); - - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), ScalableVlsegIntrIds[Factor - 2], - {VecTupTy, 
PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = { - PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - - SmallVector AggrTypes{Factor, VTy}; - Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } - - for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { - if (!DIO) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. - Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast(Idx)}); - DIO->replaceAllUsesWith(NewEV); - } - - return true; -} - -/// Lower an interleaved vp.store into a vssegN intrinsic. -/// -/// E.g. 
Lower an interleaved vp.store (Factor = 2): -/// -/// %is = tail call -/// @llvm.vector.interleave2.nxv64i8( -/// %load0, -/// %load1 -/// %wide.rvl = shl nuw nsw i32 %rvl, 1 -/// tail call void @llvm.vp.store.nxv64i8.p0( -/// %is, ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// -/// Into: -/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64( -/// %load1, -/// %load2, ptr %ptr, -/// %mask, -/// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedVPStore( - VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOperands) const { - assert(Mask && "Expect a valid mask"); - assert(Store->getIntrinsicID() == Intrinsic::vp_store && - "Unexpected intrinsic"); - - const unsigned Factor = InterleaveOperands.size(); - - auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); - if (!VTy) - return false; - - const DataLayout &DL = Store->getDataLayout(); - Align Alignment = Store->getParamAlign(1).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Store); - Value *WideEVL = Store->getArgOperand(3); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Store->getArgOperand(1)->getType(); - auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); - - if (auto *FVTy = dyn_cast(VTy)) { - SmallVector Operands(InterleaveOperands); - Operands.append({Store->getArgOperand(1), Mask, EVL}); - Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], - {FVTy, PtrTy, XLenTy}, Operands); - return true; - } - - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Store->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( - Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy}); - Value *StoredVal = PoisonValue::get(VecTupTy); - for (unsigned i = 0; i < Factor; ++i) - StoredVal = Builder.CreateCall( - VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); - - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - Store->getModule(), ScalableVssegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL, - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - Builder.CreateCall(VssegNFunc, Operands); - return true; -} - MachineInstr * RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index bcbda30342b80..00e969056df7d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -468,6 +468,12 @@ class RISCVTargetLowering : public 
TargetLowering { ArrayRef getRoundingControlRegisters() const override; + /// Match a mask which "spreads" the leading elements of a vector evenly + /// across the result. Factor is the spread amount, and Index is the + /// offset applied. + static bool isSpreadMask(ArrayRef Mask, unsigned Factor, + unsigned &Index); + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index b2bf09028bc40..9e3eb1c03fb37 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1312,6 +1312,30 @@ class QCIMVCCIPat : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), (XLenVT GPRNoX0:$rs3), (XLenVT GPRNoX0:$rd)), (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, GPRNoX0:$rs3)>; +class QCISELECTCCIPat + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rd), simm5:$imm, Cond)), (XLenVT GPRNoX0:$rs2), (XLenVT GPRNoX0:$rs3)), + (Inst GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, GPRNoX0:$rs3)>; + +class QCISELECTICCIPat + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rd), simm5:$imm, Cond)), (XLenVT GPRNoX0:$rs2), simm5:$simm2), + (Inst GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, simm5:$simm2)>; + +class QCISELECTICCIPatInv + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rd), simm5:$imm, Cond)), simm5:$simm2, (XLenVT GPRNoX0:$rs2)), + (Inst GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, simm5:$simm2)>; + +class QCISELECTICCPat + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rd), (XLenVT GPRNoX0:$rs1), Cond)), (XLenVT GPRNoX0:$rs2), simm5:$simm2), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2)>; + +class QCISELECTICCPatInv + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rd), (XLenVT GPRNoX0:$rs1), Cond)), simm5:$simm2, (XLenVT GPRNoX0:$rs2)), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2)>; + +class QCISELECTIICCPat + : Pat<(select (XLenVT (setcc (XLenVT 
GPRNoX0:$rd), (XLenVT GPRNoX0:$rs1), Cond)), simm5:$simm1, simm5:$simm2), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2)>; + // Match `riscv_brcc` and lower to the appropriate XQCIBI branch instruction. class BcciPat : Pat<(riscv_brcc (XLenVT GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12), @@ -1332,6 +1356,11 @@ class SelectQCbi (OpNode GPRNoX0:$lhs, InTyImm:$Constant, (IntCCtoRISCVCC $cc), GPRNoX0:$truev, GPRNoX0:$falsev)>; +let Predicates = [HasVendorXqciac, IsRV32] in { +def : Pat<(XLenVT (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))), + (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>; +} // Predicates = [HasVendorXqciac, IsRV32] + /// Simple arithmetic operations let Predicates = [HasVendorXqcilia, IsRV32] in { @@ -1461,6 +1490,37 @@ def : QCIMVCCIPat ; def : QCIMVCCIPat ; } +let Predicates = [HasVendorXqcics, IsRV32] in { +def : Pat<(select (XLenVT GPRNoX0:$rd), (XLenVT GPRNoX0:$rs2),(XLenVT GPRNoX0:$rs3)), + (QC_SELECTNEI GPRNoX0:$rd, (XLenVT 0), GPRNoX0:$rs2, GPRNoX0:$rs3)>; +def : Pat<(select (XLenVT GPRNoX0:$rd), (XLenVT GPRNoX0:$rs2), simm5:$simm2), + (QC_SELECTINEI GPRNoX0:$rd, (XLenVT 0), GPRNoX0:$rs2, simm5:$simm2)>; +def : Pat<(select (XLenVT GPRNoX0:$rd), simm5:$simm2,(XLenVT GPRNoX0:$rs2)), + (QC_SELECTIEQI GPRNoX0:$rd, (XLenVT 0), GPRNoX0:$rs2, simm5:$simm2)>; + +// Below AddedComplexity is added to prefer these conditional select instructions over +// conditional move instructions +let AddedComplexity = 1 in { +def : QCISELECTCCIPat ; +def : QCISELECTCCIPat ; +} + +def : QCISELECTICCIPat ; +def : QCISELECTICCIPat ; + +def : QCISELECTICCIPatInv ; +def : QCISELECTICCIPatInv ; + +def : QCISELECTICCPat ; +def : QCISELECTICCPat ; + +def : QCISELECTICCPatInv ; +def : QCISELECTICCPatInv ; + +def : QCISELECTIICCPat ; +def : QCISELECTIICCPat ; +} // Predicates = [HasVendorXqcics, IsRV32] + //===----------------------------------------------------------------------===/i // Compress Instruction tablegen backend. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp new file mode 100644 index 0000000000000..a6ff22c4b391f --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -0,0 +1,596 @@ +//===-- RISCVInterleavedAccess.cpp - RISC-V Interleaved Access Transform --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functions and callbacks related to the InterleavedAccessPass. +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVISelLowering.h" +#include "RISCVSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" + +using namespace llvm; + +bool RISCVTargetLowering::isLegalInterleavedAccessType( + VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, + const DataLayout &DL) const { + EVT VT = getValueType(DL, VTy); + // Don't lower vlseg/vsseg for vector types that can't be split. + if (!isTypeLegal(VT)) + return false; + + if (!isLegalElementTypeForRVV(VT.getScalarType()) || + !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, + Alignment)) + return false; + + MVT ContainerVT = VT.getSimpleVT(); + + if (auto *FVTy = dyn_cast(VTy)) { + if (!Subtarget.useRVVForFixedLengthVectors()) + return false; + // Sometimes the interleaved access pass picks up splats as interleaves of + // one element. Don't lower these. 
+ if (FVTy->getNumElements() < 2) + return false; + + ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT()); + } + + // Need to make sure that EMUL * NFIELDS ≤ 8 + auto [LMUL, Fractional] = RISCVVType::decodeVLMUL(getLMUL(ContainerVT)); + if (Fractional) + return true; + return Factor * LMUL <= 8; +} + +static const Intrinsic::ID FixedVlsegIntrIds[] = { + Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask, + Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask, + Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, + Intrinsic::riscv_seg8_load_mask}; + +static const Intrinsic::ID ScalableVlsegIntrIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; + +/// Lower an interleaved load into a vlsegN intrinsic. +/// +/// E.g. Lower an interleaved load (Factor = 2): +/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements +/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements +/// +/// Into: +/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.seg2.load.v4i32.p0.i64( +/// %ptr, i64 4) +/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 +/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 +bool RISCVTargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef Shuffles, + ArrayRef Indices, unsigned Factor) const { + assert(Indices.size() == Shuffles.size()); + + IRBuilder<> Builder(LI); + + const DataLayout &DL = LI->getDataLayout(); + + auto *VTy = cast(Shuffles[0]->getType()); + if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), + LI->getPointerAddressSpace(), DL)) + return false; + + auto *PtrTy = LI->getPointerOperandType(); + auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + + // If the segment load is 
going to be performed segment at a time anyways + // and there's only one element used, use a strided load instead. This + // will be equally fast, and create less vector register pressure. + if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) { + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); + Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); + Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); + Value *VL = Builder.getInt32(VTy->getNumElements()); + + CallInst *CI = + Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, + {VTy, BasePtr->getType(), Stride->getType()}, + {BasePtr, Stride, Mask, VL}); + CI->addParamAttr( + 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + Shuffles[0]->replaceAllUsesWith(CI); + return true; + }; + + Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); + Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); + CallInst *VlsegN = Builder.CreateIntrinsic( + FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, + {LI->getPointerOperand(), Mask, VL}); + + for (unsigned i = 0; i < Shuffles.size(); i++) { + Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); + Shuffles[i]->replaceAllUsesWith(SubVec); + } + + return true; +} + +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, 
+ Intrinsic::riscv_vsseg8_mask}; + +/// Lower an interleaved store into a vssegN intrinsic. +/// +/// E.g. Lower an interleaved store (Factor = 3): +/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr +/// +/// Into: +/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> +/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> +/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> +/// call void llvm.riscv.seg3.store.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2, +/// %ptr, i32 4) +/// +/// Note that the new shufflevectors will be removed and we'll only generate one +/// vsseg3 instruction in CodeGen. +bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + IRBuilder<> Builder(SI); + const DataLayout &DL = SI->getDataLayout(); + auto Mask = SVI->getShuffleMask(); + auto *ShuffleVTy = cast(SVI->getType()); + // Given SVI : , then VTy : + auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), + ShuffleVTy->getNumElements() / Factor); + if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), + SI->getPointerAddressSpace(), DL)) + return false; + + auto *PtrTy = SI->getPointerOperandType(); + auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + + unsigned Index; + // If the segment store only has one active lane (i.e. the interleave is + // just a spread shuffle), we can use a strided store instead. This will + // be equally fast, and create less vector register pressure. 
+ if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && + isSpreadMask(Mask, Factor, Index)) { + unsigned ScalarSizeInBytes = + DL.getTypeStoreSize(ShuffleVTy->getElementType()); + Value *Data = SVI->getOperand(0); + auto *DataVTy = cast(Data->getType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); + Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); + Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); + Value *VL = Builder.getInt32(VTy->getNumElements()); + + CallInst *CI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_store, + {Data->getType(), BasePtr->getType(), Stride->getType()}, + {Data, BasePtr, Stride, Mask, VL}); + CI->addParamAttr( + 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); + + return true; + } + + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); + + SmallVector Ops; + SmallVector NewShuffleMask; + + for (unsigned i = 0; i < Factor; i++) { + // Collect shuffle mask for this lane. 
+ for (unsigned j = 0; j < VTy->getNumElements(); j++) + NewShuffleMask.push_back(Mask[i + Factor * j]); + + Value *Shuffle = Builder.CreateShuffleVector( + SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask); + Ops.push_back(Shuffle); + + NewShuffleMask.clear(); + } + // This VL should be OK (should be executable in one vsseg instruction, + // potentially under larger LMULs) because we checked that the fixed vector + // type fits in isLegalInterleavedAccessType + Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); + Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount()); + Ops.append({SI->getPointerOperand(), StoreMask, VL}); + + Builder.CreateCall(VssegNFunc, Ops); + + return true; +} + +bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( + LoadInst *LI, ArrayRef DeinterleaveValues) const { + const unsigned Factor = DeinterleaveValues.size(); + if (Factor > 8) + return false; + + assert(LI->isSimple()); + IRBuilder<> Builder(LI); + + Value *FirstActive = + *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); + VectorType *ResVTy = cast(FirstActive->getType()); + + const DataLayout &DL = LI->getDataLayout(); + + if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), + LI->getPointerAddressSpace(), DL)) + return false; + + Value *Return; + Type *PtrTy = LI->getPointerOperandType(); + Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + + if (auto *FVTy = dyn_cast(ResVTy)) { + Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); + Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); + Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {ResVTy, PtrTy, XLenTy}, + {LI->getPointerOperand(), Mask, VL}); + } else { + static const Intrinsic::ID IntrIds[] = { + Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, + Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, + Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, + Intrinsic::riscv_vlseg8}; + + unsigned SEW 
= DL.getTypeSizeInBits(ResVTy->getElementType()); + unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + LI->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(LI->getContext()), + NumElts * SEW / 8), + Factor); + + Value *VL = Constant::getAllOnesValue(XLenTy); + + Value *Vlseg = Builder.CreateIntrinsic( + IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}, + {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL, + ConstantInt::get(XLenTy, Log2_64(SEW))}); + + SmallVector AggrTypes{Factor, ResVTy}; + Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes)); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = Builder.CreateIntrinsic( + Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}, + {Vlseg, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } + } + + for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { + if (!DIV) + continue; + // We have to create a brand new ExtractValue to replace each + // of these old ExtractValue instructions. 
+ Value *NewEV = + Builder.CreateExtractValue(Return, {static_cast(Idx)}); + DIV->replaceAllUsesWith(NewEV); + } + + return true; +} + +bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( + StoreInst *SI, ArrayRef InterleaveValues) const { + unsigned Factor = InterleaveValues.size(); + if (Factor > 8) + return false; + + assert(SI->isSimple()); + IRBuilder<> Builder(SI); + + auto *InVTy = cast(InterleaveValues[0]->getType()); + auto *PtrTy = SI->getPointerOperandType(); + const DataLayout &DL = SI->getDataLayout(); + + if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), + SI->getPointerAddressSpace(), DL)) + return false; + + Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + + if (auto *FVTy = dyn_cast(InVTy)) { + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); + + SmallVector Ops(InterleaveValues); + Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); + Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); + Ops.append({SI->getPointerOperand(), Mask, VL}); + + Builder.CreateCall(VssegNFunc, Ops); + } else { + static const Intrinsic::ID IntrIds[] = { + Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, + Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, + Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, + Intrinsic::riscv_vsseg8}; + + unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); + unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + SI->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), + NumElts * SEW / 8), + Factor); + + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}); + + Value *VL = Constant::getAllOnesValue(XLenTy); + + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = 
Builder.CreateIntrinsic( + Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, + {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); + + Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL, + ConstantInt::get(XLenTy, Log2_64(SEW))}); + } + + return true; +} + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_c_Mul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + +/// Lower an interleaved vp.load into a vlsegN intrinsic. +/// +/// E.g. Lower an interleaved vp.load (Factor = 2): +/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// %dl = tail call { , } +/// @llvm.vector.deinterleave2.nxv64i8( +/// %l) +/// %r0 = extractvalue { , } %dl, 0 +/// %r1 = extractvalue { , } %dl, 1 +/// +/// Into: +/// %rvl = udiv %wide.rvl, 2 +/// %sl = call { , } +/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, +/// undef, +/// ptr %ptr, +/// %mask, +/// i64 %rvl, +/// i64 1) +/// %r0 = extractvalue { , } %sl, 0 +/// %r1 = extractvalue { , } %sl, 1 +/// +/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be +/// removed by the caller +/// TODO: We probably can loosen the dependency on matching extractvalue when +/// dealing with factor of 2 (extractvalue is still required for most of other +/// factors though). 
+bool RISCVTargetLowering::lowerInterleavedVPLoad( + VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveResults) const { + const unsigned Factor = DeinterleaveResults.size(); + assert(Mask && "Expect a valid mask"); + assert(Load->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + + Value *FirstActive = *llvm::find_if(DeinterleaveResults, + [](Value *V) { return V != nullptr; }); + VectorType *VTy = cast(FirstActive->getType()); + + auto &DL = Load->getModule()->getDataLayout(); + Align Alignment = Load->getParamAlign(0).value_or( + DL.getABITypeAlign(VTy->getElementType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) + return false; + + IRBuilder<> Builder(Load); + + Value *WideEVL = Load->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) + return false; + + auto *PtrTy = Load->getArgOperand(0)->getType(); + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExt( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + Value *Return = nullptr; + if (auto *FVTy = dyn_cast(VTy)) { + Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {FVTy, PtrTy, XLenTy}, + {Load->getArgOperand(0), Mask, EVL}); + } else { + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); + + Value *PoisonVal = PoisonValue::get(VecTupTy); + + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), ScalableVlsegIntrIds[Factor - 2], + {VecTupTy, 
PtrTy, Mask->getType(), EVL->getType()}); + + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; + + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + + SmallVector AggrTypes{Factor, VTy}; + Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } + } + + for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { + if (!DIO) + continue; + // We have to create a brand new ExtractValue to replace each + // of these old ExtractValue instructions. + Value *NewEV = + Builder.CreateExtractValue(Return, {static_cast(Idx)}); + DIO->replaceAllUsesWith(NewEV); + } + + return true; +} + +/// Lower an interleaved vp.store into a vssegN intrinsic. +/// +/// E.g. 
Lower an interleaved vp.store (Factor = 2): +/// +/// %is = tail call +/// @llvm.vector.interleave2.nxv64i8( +/// %load0, +/// %load1 +/// %wide.rvl = shl nuw nsw i32 %rvl, 1 +/// tail call void @llvm.vp.store.nxv64i8.p0( +/// %is, ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// +/// Into: +/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64( +/// %load1, +/// %load2, ptr %ptr, +/// %mask, +/// i64 %rvl) +bool RISCVTargetLowering::lowerInterleavedVPStore( + VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOperands) const { + assert(Mask && "Expect a valid mask"); + assert(Store->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + + const unsigned Factor = InterleaveOperands.size(); + + auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); + if (!VTy) + return false; + + const DataLayout &DL = Store->getDataLayout(); + Align Alignment = Store->getParamAlign(1).value_or( + DL.getABITypeAlign(VTy->getElementType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) + return false; + + IRBuilder<> Builder(Store); + Value *WideEVL = Store->getArgOperand(3); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. 
+ if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) + return false; + + auto *PtrTy = Store->getArgOperand(1)->getType(); + auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExt( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + if (auto *FVTy = dyn_cast(VTy)) { + SmallVector Operands(InterleaveOperands); + Operands.append({Store->getArgOperand(1), Mask, EVL}); + Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], + {FVTy, PtrTy, XLenTy}, Operands); + return true; + } + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), + NumElts * SEW / 8), + Factor); + + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy}); + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = Builder.CreateCall( + VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); + + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), ScalableVssegIntrIds[Factor - 2], + {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); + + Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL, + ConstantInt::get(XLenTy, Log2_64(SEW))}; + + Builder.CreateCall(VssegNFunc, Operands); + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 24c05a2f807d0..d257f56cf4129 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -323,6 +323,12 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, Worklist.push_back(std::make_pair(UserMI, Bits)); break; + case RISCV::BREV8: + case RISCV::ORC_B: + // 
BREV8 and ORC_B work on bytes. Round Bits down to the nearest byte. + Worklist.push_back(std::make_pair(UserMI, alignDown(Bits, 8))); + break; + case RISCV::PseudoCCMOVGPR: case RISCV::PseudoCCMOVGPRNoX0: // Either operand 4 or operand 5 is returned by this instruction. If diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 07907298386c3..84ef53985484f 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -521,16 +521,23 @@ bool RISCVVectorPeephole::convertToUnmasked(MachineInstr &MI) const { /// Check if it's safe to move From down to To, checking that no physical /// registers are clobbered. static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To) { - assert(From.getParent() == To.getParent() && !From.hasImplicitDef()); - SmallVector PhysUses; + assert(From.getParent() == To.getParent()); + SmallVector PhysUses, PhysDefs; for (const MachineOperand &MO : From.all_uses()) if (MO.getReg().isPhysical()) PhysUses.push_back(MO.getReg()); + for (const MachineOperand &MO : From.all_defs()) + if (MO.getReg().isPhysical()) + PhysDefs.push_back(MO.getReg()); bool SawStore = false; - for (auto II = From.getIterator(); II != To.getIterator(); II++) { + for (auto II = std::next(From.getIterator()); II != To.getIterator(); II++) { for (Register PhysReg : PhysUses) if (II->definesRegister(PhysReg, nullptr)) return false; + for (Register PhysReg : PhysDefs) + if (II->definesRegister(PhysReg, nullptr) || + II->readsRegister(PhysReg, nullptr)) + return false; if (II->mayStore()) { SawStore = true; break; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index b2ea784057780..ec95e86e4fe3d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -35,10 +35,8 @@ #include "llvm/IR/GlobalVariable.h" #include 
"llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" -#include "llvm/IR/PatternMatch.h" using namespace llvm; -using namespace PatternMatch; #define DEBUG_TYPE "wasm-fastisel" diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 8e304c07ed5cb..7fe58539cd4ec 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -108,6 +109,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .legalFor(HasSSE2 || UseX87, {s64}) .legalFor(UseX87, {s80}); + getActionDefinitionsBuilder(G_GET_ROUNDING).customFor({s32}); + // merge/unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 
0 : 1; @@ -611,6 +614,8 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, return legalizeSITOFP(MI, MRI, Helper); case TargetOpcode::G_FPTOSI: return legalizeFPTOSI(MI, MRI, Helper); + case TargetOpcode::G_GET_ROUNDING: + return legalizeGETROUNDING(MI, MRI, Helper); } llvm_unreachable("expected switch to return"); } @@ -777,6 +782,82 @@ bool X86LegalizerInfo::legalizeNarrowingStore(MachineInstr &MI, return true; } +bool X86LegalizerInfo::legalizeGETROUNDING(MachineInstr &MI, + MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const { + /* + The rounding mode is in bits 11:10 of FPSR, and has the following + settings: + 00 Round to nearest + 01 Round to -inf + 10 Round to +inf + 11 Round to 0 + + GET_ROUNDING, on the other hand, expects the following: + -1 Undefined + 0 Round to 0 + 1 Round to nearest + 2 Round to +inf + 3 Round to -inf + + To perform the conversion, we use a packed lookup table of the four 2-bit + values that we can index by FPSP[11:10] + 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10] + + (0x2d >> ((FPSR >> 9) & 6)) & 3 + */ + + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineFunction &MF = MIRBuilder.getMF(); + Register Dst = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(Dst); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + + // Save FP Control Word to stack slot + int MemSize = 2; + Align Alignment = Align(2); + MachinePointerInfo PtrInfo; + auto StackTemp = Helper.createStackTemporary(TypeSize::getFixed(MemSize), + Alignment, PtrInfo); + Register StackPtr = StackTemp.getReg(0); + + auto StoreMMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + MemSize, Alignment); + + // Store FP Control Word to stack slot using G_FNSTCW16 + MIRBuilder.buildInstr(X86::G_FNSTCW16) + .addUse(StackPtr) + .addMemOperand(StoreMMO); + + // Load FP Control Word from stack slot + auto LoadMMO = MF.getMachineMemOperand(PtrInfo, 
MachineMemOperand::MOLoad, + MemSize, Alignment); + + auto CWD32 = + MIRBuilder.buildZExt(s32, MIRBuilder.buildLoad(s16, StackPtr, *LoadMMO)); + auto Shifted8 = MIRBuilder.buildTrunc( + s8, MIRBuilder.buildLShr(s32, CWD32, MIRBuilder.buildConstant(s8, 9))); + auto Masked32 = MIRBuilder.buildZExt( + s32, MIRBuilder.buildAnd(s8, Shifted8, MIRBuilder.buildConstant(s8, 6))); + + // LUT is a packed lookup table (0x2d) used to map the 2-bit x87 FPU rounding + // mode (from bits 11:10 of the control word) to the values expected by + // GET_ROUNDING. The mapping is performed by shifting LUT right by the + // extracted rounding mode and masking the result with 3 to obtain the final + // result. + auto LUT = MIRBuilder.buildConstant(s32, 0x2d); + auto LUTShifted = MIRBuilder.buildLShr(s32, LUT, Masked32); + auto RetVal = + MIRBuilder.buildAnd(s32, LUTShifted, MIRBuilder.buildConstant(s32, 3)); + auto RetValTrunc = MIRBuilder.buildZExtOrTrunc(DstTy, RetVal); + + MIRBuilder.buildCopy(Dst, RetValTrunc); + + MI.eraseFromParent(); + return true; +} + bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { return true; diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h index 1ba82674ed4c6..0003552d70ee0 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h @@ -54,6 +54,9 @@ class X86LegalizerInfo : public LegalizerInfo { bool legalizeFPTOSI(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; + + bool legalizeGETROUNDING(MachineInstr &MI, MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 772e48efb8607..990b381341f07 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1331,7 +1331,7 @@ def ProcessorFeatures { // Pantherlake list PTLAdditionalFeatures = [FeaturePREFETCHI]; list PTLFeatures = 
!listconcat(ARLSFeatures, PTLAdditionalFeatures); + !listremove(!listconcat(ARLSFeatures, PTLAdditionalFeatures), [FeatureWIDEKL]); // Clearwaterforest @@ -1342,7 +1342,7 @@ def ProcessorFeatures { FeatureSM4, FeatureUSERMSR]; list CWFFeatures = - !listconcat(SRFFeatures, CWFAdditionalFeatures); + !listremove(!listconcat(SRFFeatures, CWFAdditionalFeatures), [FeatureWIDEKL]); // Knights Landing list KNLFeatures = [FeatureX87, diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 50c20fcde49ce..d406277e440bb 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -192,9 +192,9 @@ void X86AsmPrinter::emitKCFITypeId(const MachineFunction &MF) { unsigned DestReg = X86::EAX; if (F.getParent()->getModuleFlag("kcfi-arity")) { - // The ArityToRegMap assumes the 64-bit Linux kernel ABI + // The ArityToRegMap assumes the 64-bit SysV ABI. [[maybe_unused]] const auto &Triple = MF.getTarget().getTargetTriple(); - assert(Triple.isArch64Bit() && Triple.isOSLinux()); + assert(Triple.isArch64Bit() && !Triple.isOSWindows()); // Determine the function's arity (i.e., the number of arguments) at the ABI // level by counting the number of parameters that are passed diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 215906d9df8b3..823e0caa02262 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -832,6 +832,7 @@ def CC_X86_32_Common : CallingConv<[ CCIfType<[f64], CCAssignToStack<8, 4>>, // Long doubles get slots whose size and alignment depends on the subtarget. + CCIfSubtarget<"isTargetDarwin()", CCIfType<[f80], CCAssignToStack<0, 4>>>, CCIfType<[f80], CCAssignToStack<0, 0>>, // Boolean vectors of AVX-512 are passed in SIMD registers. 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1ad1b47a94d28..5e35d5630d667 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4975,6 +4975,16 @@ X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { return getTargetConstantFromNode(LD); } +bool X86TargetLowering::isTargetCanonicalSelect(SDNode *N) const { + // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X) + SDValue Cond = N->getOperand(0); + SDValue RHS = N->getOperand(2); + EVT CondVT = Cond.getValueType(); + return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() && + CondVT.getVectorElementType() == MVT::i1 && + ISD::isBuildVectorAllZeros(RHS.getNode()); +} + // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3039b7eeb38ff..6bcb7a36e91b5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1356,6 +1356,8 @@ namespace llvm { TargetLowering::isTargetCanonicalConstantNode(Op); } + bool isTargetCanonicalSelect(SDNode *N) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; diff --git a/llvm/lib/Target/X86/X86InstrGISel.td b/llvm/lib/Target/X86/X86InstrGISel.td index f4fa33807cd9a..39198214037a3 100644 --- a/llvm/lib/Target/X86/X86InstrGISel.td +++ b/llvm/lib/Target/X86/X86InstrGISel.td @@ -27,5 +27,13 @@ def G_FIST : X86GenericInstruction { let mayStore = true; } +def G_FNSTCW16 : X86GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$dst); + let hasSideEffects = true; + let mayStore = true; +} + def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp 
b/llvm/lib/TargetParser/X86TargetParser.cpp index 94812e4e60c3d..57fbc71fa22ee 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -176,10 +176,10 @@ constexpr FeatureBitset FeaturesArrowlakeS = FeaturesArrowlake | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4; constexpr FeatureBitset FeaturesPantherlake = - FeaturesArrowlakeS | FeaturePREFETCHI; + FeaturesArrowlakeS ^ FeatureWIDEKL | FeaturePREFETCHI; constexpr FeatureBitset FeaturesClearwaterforest = - FeaturesSierraforest | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | - FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; + FeaturesSierraforest ^ FeatureWIDEKL | FeatureAVXVNNIINT16 | FeatureSHA512 | + FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; // Geode Processor. constexpr FeatureBitset FeaturesGeode = diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 7224a56cd7b8a..fe30c6dc6abe4 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -639,10 +639,10 @@ static DIType *solveDIType(DIBuilder &Builder, Type *Ty, SmallVector Elements; for (unsigned I = 0; I < StructTy->getNumElements(); I++) { DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout, - Scope, LineNum, DITypeCache); + DIStruct, LineNum, DITypeCache); assert(DITy); Elements.push_back(Builder.createMemberType( - Scope, DITy->getName(), Scope->getFile(), LineNum, + DIStruct, DITy->getName(), DIStruct->getFile(), LineNum, DITy->getSizeInBits(), DITy->getAlignInBits(), Layout.getStructLayout(StructTy)->getElementOffsetInBits(I), llvm::DINode::FlagArtificial, DITy)); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index c0f84456d2b27..469f435374793 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ 
b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -95,6 +95,8 @@ STATISTIC(NewMergedNodes, "Number of new nodes created during merging"); STATISTIC(NonNewMergedNodes, "Number of non new nodes used during merging"); STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); +STATISTIC(SkippedCallsCloning, + "Number of calls skipped during cloning due to unexpected operand"); static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -181,6 +183,12 @@ static cl::opt AllowRecursiveContexts( "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, cl::desc("Allow cloning of contexts having recursive cycles")); +// Set the minimum absolute count threshold for allowing inlining of indirect +// calls promoted during cloning. +static cl::opt MemProfICPNoInlineThreshold( + "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden, + cl::desc("Minimum absolute count for promoted target to be inlinable")); + namespace llvm { cl::opt EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, @@ -5155,6 +5163,19 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { assert(!isMemProfClone(*CalledFunction)); + // Because we update the cloned calls by calling setCalledOperand (see + // comment below), out of an abundance of caution make sure the called + // function was actually the called operand (or its aliasee). We also + // strip pointer casts when looking for calls (to match behavior during + // summary generation), however, with opaque pointers in theory this + // should not be an issue. Note we still clone the current function + // (containing this call) above, as that could be needed for its callers. + auto *GA = dyn_cast_or_null(CB->getCalledOperand()); + if (CalledFunction != CB->getCalledOperand() && + (!GA || CalledFunction != GA->getAliaseeObject())) { + SkippedCallsCloning++; + return; + } // Update the calls per the summary info. 
// Save orig name since it gets updated in the first iteration // below. @@ -5173,7 +5194,13 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { CBClone = CB; else CBClone = cast((*VMaps[J - 1])[CB]); - CBClone->setCalledFunction(NewF); + // Set the called operand directly instead of calling setCalledFunction, + // as the latter mutates the function type on the call. In rare cases + // we may have a slightly different type on a callee function + // declaration due to it being imported from a different module with + // incomplete types. We really just want to change the name of the + // function to the clone, and not make any type changes. + CBClone->setCalledOperand(NewF.getCallee()); ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) << ore::NV("Call", CBClone) << " in clone " << ore::NV("Caller", CBClone->getFunction()) @@ -5573,6 +5600,15 @@ void MemProfContextDisambiguation::performICP( .getCallee()); } DirectCall.setCalledFunction(TargetToUse); + // During matching we generate synthetic VP metadata for indirect calls + // not already having any, from the memprof profile's callee GUIDs. If + // we subsequently promote and inline those callees, we currently lose + // the ability to generate this synthetic VP metadata. Optionally apply + // a noinline attribute to promoted direct calls, where the threshold is + // set to capture synthetic VP metadata targets which get a count of 1. 
+ if (MemProfICPNoInlineThreshold && + Candidate.Count < MemProfICPNoInlineThreshold) + DirectCall.setIsNoInline(); ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) << ore::NV("Call", CBClone) << " in clone " << ore::NV("Caller", CBClone->getFunction()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 706cb828acc63..3beda6bc5ba38 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3717,6 +3717,30 @@ Value *InstCombinerImpl::reassociateDisjointOr(Value *LHS, Value *RHS) { return nullptr; } +/// Fold Res, Overflow = (umul.with.overflow x c1); (or Overflow (ugt Res c2)) +/// --> (ugt x (c2/c1)). This code checks whether a multiplication of two +/// unsigned numbers (one is a constant) is mathematically greater than a +/// second constant. +static Value *foldOrUnsignedUMulOverflowICmp(BinaryOperator &I, + InstCombiner::BuilderTy &Builder, + const DataLayout &DL) { + Value *WOV, *X; + const APInt *C1, *C2; + if (match(&I, + m_c_Or(m_ExtractValue<1>( + m_CombineAnd(m_Intrinsic( + m_Value(X), m_APInt(C1)), + m_Value(WOV))), + m_OneUse(m_SpecificCmp(ICmpInst::ICMP_UGT, + m_ExtractValue<0>(m_Deferred(WOV)), + m_APInt(C2))))) && + !C1->isZero()) { + Constant *NewC = ConstantInt::get(X->getType(), C2->udiv(*C1)); + return Builder.CreateICmp(ICmpInst::ICMP_UGT, X, NewC); + } + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -4150,6 +4174,11 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { } } + // Try to fold the pattern "Overflow | icmp pred Res, C2" into a single + // comparison instruction for umul.with.overflow. 
+ if (Value *R = foldOrUnsignedUMulOverflowICmp(I, Builder, DL)) + return replaceInstUsesWith(I, R); + // (~x) | y --> ~(x & (~y)) iff that gets rid of inversions if (sinkNotIntoOtherHandOfLogicalOp(I)) return &I; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 666f3e51cb30f..5f5200b2c9e62 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4569,16 +4569,37 @@ struct MemorySanitizerVisitor : public InstVisitor { SC.Done(&I); } - // Instrument abs intrinsic. - // handleUnknownIntrinsic can't handle it because of the last - // is_int_min_poison argument which does not match the result type. + // Instrument @llvm.abs intrinsic. + // + // e.g., i32 @llvm.abs.i32 (i32 , i1 ) + // <4 x i32> @llvm.abs.v4i32(<4 x i32> , i1 ) void handleAbsIntrinsic(IntrinsicInst &I) { + assert(I.arg_size() == 2); + Value *Src = I.getArgOperand(0); + Value *IsIntMinPoison = I.getArgOperand(1); + assert(I.getType()->isIntOrIntVectorTy()); - assert(I.getArgOperand(0)->getType() == I.getType()); - // FIXME: Handle is_int_min_poison. 
+ assert(Src->getType() == I.getType()); + + assert(IsIntMinPoison->getType()->isIntegerTy()); + assert(IsIntMinPoison->getType()->getIntegerBitWidth() == 1); + IRBuilder<> IRB(&I); - setShadow(&I, getShadow(&I, 0)); + Value *SrcShadow = getShadow(Src); + + APInt MinVal = + APInt::getSignedMinValue(Src->getType()->getScalarSizeInBits()); + Value *MinValVec = ConstantInt::get(Src->getType(), MinVal); + Value *SrcIsMin = IRB.CreateICmp(CmpInst::ICMP_EQ, Src, MinValVec); + + Value *PoisonedShadow = getPoisonedShadow(Src); + Value *PoisonedIfIntMinShadow = + IRB.CreateSelect(SrcIsMin, PoisonedShadow, SrcShadow); + Value *Shadow = + IRB.CreateSelect(IsIntMinPoison, PoisonedIfIntMinShadow, SrcShadow); + + setShadow(&I, Shadow); setOrigin(&I, getOrigin(&I, 0)); } diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5b2ca8c5915ff..a69d64956d6d9 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -129,7 +128,6 @@ struct PredInfo { using BBPredicates = DenseMap; using PredMap = DenseMap; using BB2BBMap = DenseMap; -using Val2BBMap = DenseMap; // A traits type that is intended to be used in graph algorithms. 
The graph // traits starts at an entry node, and traverses the RegionNodes that are in @@ -281,7 +279,7 @@ class StructurizeCFG { ConstantInt *BoolTrue; ConstantInt *BoolFalse; Value *BoolPoison; - const TargetTransformInfo *TTI; + Function *Func; Region *ParentRegion; @@ -303,12 +301,8 @@ class StructurizeCFG { PredMap LoopPreds; BranchVector LoopConds; - Val2BBMap HoistedValues; - RegionNode *PrevNode; - void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB); - void orderNodes(); void analyzeLoops(RegionNode *N); @@ -338,8 +332,6 @@ class StructurizeCFG { void simplifyAffectedPhis(); - void simplifyHoistedPhis(); - DebugLoc killTerminator(BasicBlock *BB); void changeExit(RegionNode *Node, BasicBlock *NewExit, @@ -367,7 +359,7 @@ class StructurizeCFG { public: void init(Region *R); - bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI); + bool run(Region *R, DominatorTree *DT); bool makeUniformRegion(Region *R, UniformityInfo &UA); }; @@ -393,11 +385,8 @@ class StructurizeCFGLegacyPass : public RegionPass { if (SCFG.makeUniformRegion(R, UA)) return false; } - Function *F = R->getEntry()->getParent(); - const TargetTransformInfo *TTI = - &getAnalysis().getTTI(*F); DominatorTree *DT = &getAnalysis().getDomTree(); - return SCFG.run(R, DT, TTI); + return SCFG.run(R, DT); } StringRef getPassName() const override { return "Structurize control flow"; } @@ -405,9 +394,7 @@ class StructurizeCFGLegacyPass : public RegionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { if (SkipUniformRegions) AU.addRequired(); - AU.addRequired(); AU.addRequired(); - AU.addRequired(); AU.addPreserved(); RegionPass::getAnalysisUsage(AU); @@ -416,34 +403,6 @@ class StructurizeCFGLegacyPass : public RegionPass { } // end anonymous namespace -/// Checks whether an instruction is zero cost instruction and checks if the -/// operands are from different BB. If so, this instruction can be coalesced -/// if its hoisted to predecessor block. 
So, this returns true. -static bool isHoistableInstruction(Instruction *I, BasicBlock *BB, - const TargetTransformInfo *TTI) { - if (I->getParent() != BB) - return false; - - // If the instruction is not a zero cost instruction, return false. - auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); - InstructionCost::CostType CostVal = - Cost.isValid() - ? Cost.getValue() - : (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive; - if (CostVal != 0) - return false; - - // Check if any operands are instructions defined in the same block. - for (auto &Op : I->operands()) { - if (auto *OpI = dyn_cast(Op)) { - if (OpI->getParent() == BB) - return false; - } - } - - return true; -} - char StructurizeCFGLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg", @@ -454,39 +413,6 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg", "Structurize the CFG", false, false) -/// Structurization can introduce unnecessary VGPR copies due to register -/// coalescing interference. For example, if the Else block has a zero-cost -/// instruction and the Then block modifies the VGPR value, only one value is -/// live at a time in merge block before structurization. After structurization, -/// the coalescer may incorrectly treat the Then value as live in the Else block -/// (via the path Then → Flow → Else), leading to unnecessary VGPR copies. -/// -/// This function examines phi nodes whose incoming values are zero-cost -/// instructions in the Else block. It identifies such values that can be safely -/// hoisted and moves them to the nearest common dominator of Then and Else -/// blocks. A follow-up function after setting PhiNodes assigns the hoisted -/// value to poison phi nodes along the if→flow edge, aiding register coalescing -/// and minimizing unnecessary live ranges. 
-void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, - BasicBlock *ThenBB) { - - BasicBlock *ElseSucc = ElseBB->getSingleSuccessor(); - BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB); - - if (!ElseSucc || !CommonDominator) - return; - Instruction *Term = CommonDominator->getTerminator(); - for (PHINode &Phi : ElseSucc->phis()) { - Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB); - auto *Inst = dyn_cast(ElseVal); - if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI)) - continue; - Inst->removeFromParent(); - Inst->insertInto(CommonDominator, Term->getIterator()); - HoistedValues[Inst] = CommonDominator; - } -} - /// Build up the general order of nodes, by performing a topological sort of the /// parent region's nodes, while ensuring that there is no outer cycle node /// between any two inner cycle nodes. @@ -609,7 +535,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { BasicBlock *Other = Term->getSuccessor(!i); if (Visited.count(Other) && !Loops.count(Other) && !Pred.count(Other) && !Pred.count(P)) { - hoistZeroCostElseBlockPhiValues(Succ, Other); + Pred[Other] = {BoolFalse, std::nullopt}; Pred[P] = {BoolTrue, std::nullopt}; continue; @@ -965,44 +891,6 @@ void StructurizeCFG::setPhiValues() { AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); } -/// Updates PHI nodes after hoisted zero cost instructions by replacing poison -/// entries on Flow nodes with the appropriate hoisted values -void StructurizeCFG::simplifyHoistedPhis() { - for (WeakVH VH : AffectedPhis) { - PHINode *Phi = dyn_cast_or_null(VH); - if (!Phi || Phi->getNumIncomingValues() != 2) - continue; - - for (int i = 0; i < 2; i++) { - Value *V = Phi->getIncomingValue(i); - auto BBIt = HoistedValues.find(V); - - if (BBIt == HoistedValues.end()) - continue; - - Value *OtherV = Phi->getIncomingValue(!i); - PHINode *OtherPhi = dyn_cast(OtherV); - if (!OtherPhi) - continue; - - int PoisonValBBIdx = -1; - for (size_t i = 0; i < 
OtherPhi->getNumIncomingValues(); i++) { - if (!isa(OtherPhi->getIncomingValue(i))) - continue; - PoisonValBBIdx = i; - break; - } - if (PoisonValBBIdx == -1 || - !DT->dominates(BBIt->second, - OtherPhi->getIncomingBlock(PoisonValBBIdx))) - continue; - - OtherPhi->setIncomingValue(PoisonValBBIdx, V); - Phi->setIncomingValue(i, OtherV); - } - } -} - void StructurizeCFG::simplifyAffectedPhis() { bool Changed; do { @@ -1395,13 +1283,12 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { } /// Run the transformation for each region found -bool StructurizeCFG::run(Region *R, DominatorTree *DT, - const TargetTransformInfo *TTI) { +bool StructurizeCFG::run(Region *R, DominatorTree *DT) { if (R->isTopLevelRegion()) return false; this->DT = DT; - this->TTI = TTI; + Func = R->getEntry()->getParent(); assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); @@ -1413,7 +1300,6 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT, insertConditions(false); insertConditions(true); setPhiValues(); - simplifyHoistedPhis(); simplifyConditions(); simplifyAffectedPhis(); rebuildSSA(); @@ -1463,7 +1349,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F, bool Changed = false; DominatorTree *DT = &AM.getResult(F); auto &RI = AM.getResult(F); - TargetTransformInfo *TTI = &AM.getResult(F); + UniformityInfo *UI = nullptr; if (SkipUniformRegions) UI = &AM.getResult(F); @@ -1482,7 +1368,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F, continue; } - Changed |= SCFG.run(R, DT, TTI); + Changed |= SCFG.run(R, DT); } if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 992f98cec0010..fb6640d5cfcf8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -175,6 +175,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] = STATISTIC(LoopsVectorized, "Number of loops 
vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); +STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized"); static cl::opt EnableEpilogueVectorization( "enable-epilogue-vectorization", cl::init(true), cl::Hidden, @@ -7205,6 +7206,8 @@ DenseMap LoopVectorizationPlanner::executePlan( "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); + if (BestVPlan.hasEarlyExit()) + ++LoopsEarlyExitVectorized; // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, @@ -10061,8 +10064,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - if (LVL.hasUncountableEarlyExit() && UserIC != 1 && - !VectorizerParams::isInterleaveForced()) { + if (LVL.hasUncountableEarlyExit() && UserIC != 1) { UserIC = 1; reportVectorizationInfo("Interleaving not supported for loops " "with uncountable early exits", diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d3761ff43f437..c61e1135524b6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3898,7 +3898,7 @@ class BoUpSLP { /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 
- int findLaneForValue(Value *V) const { + unsigned findLaneForValue(Value *V) const { unsigned FoundLane = getVectorFactor(); for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End; std::advance(It, 1)) { @@ -4344,7 +4344,7 @@ class BoUpSLP { /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { - ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L) + ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L) : Scalar(S), User(U), E(E), Lane(L) {} /// Which scalar in our function. @@ -4357,7 +4357,7 @@ class BoUpSLP { const TreeEntry &E; /// Which lane does the scalar belong to. - int Lane; + unsigned Lane; }; using UserList = SmallVector; @@ -5809,48 +5809,40 @@ static InstructionCost getExtractWithExtendCost( return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind); } -/// Correctly creates insert_subvector, checking that the index is multiple of -/// the subvectors length. Otherwise, generates shuffle using \p Generator or +/// Creates subvector insert. Generates shuffle using \p Generator or /// using default shuffle. static Value *createInsertVector( IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref)> Generator = {}) { + if (isa(Vec) && isa(V)) + return Vec; const unsigned SubVecVF = getNumElements(V->getType()); - if (Index % SubVecVF == 0) { - Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, Index); - } else { - // Create shuffle, insertvector requires that index is multiple of - // the subvector length. - const unsigned VecVF = getNumElements(Vec->getType()); - SmallVector Mask(VecVF, PoisonMaskElem); - std::iota(Mask.begin(), Mask.end(), 0); - for (unsigned I : seq(SubVecVF)) - Mask[I + Index] = I + VecVF; - if (Generator) { - Vec = Generator(Vec, V, Mask); - } else { - // 1. Resize V to the size of Vec. 
- SmallVector ResizeMask(VecVF, PoisonMaskElem); - std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0); - V = Builder.CreateShuffleVector(V, ResizeMask); - Vec = Builder.CreateShuffleVector(Vec, V, Mask); - } + // Create shuffle, insertvector requires that index is multiple of + // the subvector length. + const unsigned VecVF = getNumElements(Vec->getType()); + SmallVector Mask(VecVF, PoisonMaskElem); + if (isa(Vec)) { + auto *Begin = std::next(Mask.begin(), Index); + std::iota(Begin, std::next(Begin, SubVecVF), 0); + Vec = Builder.CreateShuffleVector(V, Mask); + return Vec; } - return Vec; + std::iota(Mask.begin(), Mask.end(), 0); + std::iota(std::next(Mask.begin(), Index), + std::next(Mask.begin(), Index + SubVecVF), VecVF); + if (Generator) + return Generator(Vec, V, Mask); + // 1. Resize V to the size of Vec. + SmallVector ResizeMask(VecVF, PoisonMaskElem); + std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0); + V = Builder.CreateShuffleVector(V, ResizeMask); + // 2. Insert V into Vec. + return Builder.CreateShuffleVector(Vec, V, Mask); } -/// Correctly creates extract_subvector, checking that the index is multiple of -/// the subvectors length. Otherwise, generates shuffle using \p Generator or -/// using default shuffle. +/// Generates subvector extract using \p Generator or using default shuffle. static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index) { - if (Index % SubVecVF == 0) { - VectorType *SubVecTy = - getWidenedType(Vec->getType()->getScalarType(), SubVecVF); - return Builder.CreateExtractVector(SubVecTy, Vec, Index); - } - // Create shuffle, extract_subvector requires that index is multiple of - // the subvector length. 
SmallVector Mask(SubVecVF, PoisonMaskElem); std::iota(Mask.begin(), Mask.end(), Index); return Builder.CreateShuffleVector(Vec, Mask); @@ -7901,7 +7893,7 @@ void BoUpSLP::buildExternalUses( // Check if the scalar is externally used as an extra arg. const auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { - int FoundLane = Entry->findLaneForValue(Scalar); + unsigned FoundLane = Entry->findLaneForValue(Scalar); LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << FoundLane << " from " << *Scalar << ".\n"); ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()); @@ -7949,7 +7941,7 @@ void BoUpSLP::buildExternalUses( if (U && Scalar->hasNUsesOrMore(UsesLimit)) U = nullptr; - int FoundLane = Entry->findLaneForValue(Scalar); + unsigned FoundLane = Entry->findLaneForValue(Scalar); LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst << " from lane " << FoundLane << " from " << *Scalar << ".\n"); @@ -16275,8 +16267,8 @@ Value *BoUpSLP::gather( assert(SLPReVec && "FixedVectorType is not expected."); Vec = createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy)); - auto *II = dyn_cast(Vec); - if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) + auto *II = dyn_cast(Vec); + if (!II) return Vec; InsElt = II; } else { @@ -16296,6 +16288,28 @@ Value *BoUpSLP::gather( if (auto *SI = dyn_cast(Scalar)) UserOp = SI; } else { + if (V->getType()->isVectorTy()) { + if (auto *SV = dyn_cast(InsElt); + SV && SV->getOperand(0) != V && SV->getOperand(1) != V) { + // Find shufflevector, caused by resize. 
+ auto FindOperand = [](Value *Vec, Value *V) -> Instruction * { + if (auto *SV = dyn_cast(Vec)) { + if (SV->getOperand(0) == V) + return SV; + if (SV->getOperand(1) == V) + return SV; + } + return nullptr; + }; + InsElt = nullptr; + if (Instruction *User = FindOperand(SV->getOperand(0), V)) + InsElt = User; + else if (Instruction *User = FindOperand(SV->getOperand(1), V)) + InsElt = User; + assert(InsElt && + "Failed to find shufflevector, caused by resize."); + } + } UserOp = InsElt; } if (UserOp) { @@ -16864,10 +16878,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { V, SimplifyQuery(*R.DL)); })); unsigned InsertionIndex = Idx * getNumElements(ScalarTy); + // Use scalar version of the SCalarType to correctly handle shuffles + // for revectorization. The revectorization mode operates by the + // vectors, but here we need to operate on the scalars, because the + // masks were already transformed for the vector elements and we don't + // need doing this transformation again. + Type *OrigScalarTy = ScalarTy; + ScalarTy = ScalarTy->getScalarType(); Vec = createInsertVector( Builder, Vec, V, InsertionIndex, std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2, _3)); + ScalarTy = OrigScalarTy; if (!CommonMask.empty()) { std::iota(std::next(CommonMask.begin(), Idx), std::next(CommonMask.begin(), Idx + E->getVectorFactor()), @@ -21722,6 +21744,8 @@ class HorizontalReduction { /// Checks if the optimization of original scalar identity operations on /// matched horizontal reductions is enabled and allowed. bool IsSupportedHorRdxIdentityOp = false; + /// The minimum number of the reduced values. + const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4; /// Contains vector values for reduction including their scale factor and /// signedness. SmallVector> VectorValuesAndScales; @@ -21740,7 +21764,8 @@ class HorizontalReduction { } /// Checks if instruction is associative and can be vectorized. 
- static bool isVectorizable(RecurKind Kind, Instruction *I) { + static bool isVectorizable(RecurKind Kind, Instruction *I, + bool TwoElementReduction = false) { if (Kind == RecurKind::None) return false; @@ -21749,6 +21774,10 @@ class HorizontalReduction { isBoolLogicOp(I)) return true; + // No need to check for associativity, if 2 reduced values. + if (TwoElementReduction) + return true; + if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { // FP min/max are associative except for NaN and -0.0. We do not // have to rule out -0.0 here because the intrinsic semantics do not @@ -22020,6 +22049,27 @@ class HorizontalReduction { public: HorizontalReduction() = default; + HorizontalReduction(Instruction *I, ArrayRef Ops) + : ReductionRoot(I), ReductionLimit(2) { + RdxKind = HorizontalReduction::getRdxKind(I); + ReductionOps.emplace_back().push_back(I); + ReducedVals.emplace_back().assign(Ops.begin(), Ops.end()); + for (Value *V : Ops) + ReducedValsToOps[V].push_back(I); + } + + bool matchReductionForOperands() const { + // Analyze "regular" integer/FP types for reductions - no target-specific + // types or pointers. + assert(ReductionRoot && "Reduction root is not set!"); + if (!isVectorizable(RdxKind, cast(ReductionRoot), + all_of(ReducedVals, [](ArrayRef Ops) { + return Ops.size() == 2; + }))) + return false; + + return true; + } /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, @@ -22187,7 +22237,6 @@ class HorizontalReduction { /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, const TargetLibraryInfo &TLI, AssumptionCache *AC) { - const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 
3 : 4; constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce @@ -22521,8 +22570,10 @@ class HorizontalReduction { continue; } V.reorderTopToBottom(); - // No need to reorder the root node at all. - V.reorderBottomToTop(/*IgnoreReorder=*/true); + // No need to reorder the root node at all for reassociative reduction. + V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() || + VL.front()->getType()->isIntOrIntVectorTy() || + ReductionLimit > 2); // Keep extracted other reduction values, if they are used in the // vectorization trees. BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( @@ -23736,15 +23787,61 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { Candidates.emplace_back(A1, B); } + auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst, + ArrayRef Ops) { + if (!isReductionCandidate(Inst)) + return false; + Type *Ty = Inst->getType(); + if (!isValidElementType(Ty) || Ty->isPointerTy()) + return false; + HorizontalReduction HorRdx(Inst, Ops); + if (!HorRdx.matchReductionForOperands()) + return false; + // Check the cost of operations. 
+ VectorType *VecTy = getWidenedType(Ty, Ops.size()); + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost ScalarCost = + TTI.getScalarizationOverhead( + VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false, + /*Extract=*/true, CostKind) + + TTI.getInstructionCost(Inst, CostKind); + InstructionCost RedCost; + switch (::getRdxKind(Inst)) { + case RecurKind::Add: + case RecurKind::Mul: + case RecurKind::Or: + case RecurKind::And: + case RecurKind::Xor: + case RecurKind::FAdd: + case RecurKind::FMul: { + FastMathFlags FMF; + if (auto *FPCI = dyn_cast(Inst)) + FMF = FPCI->getFastMathFlags(); + RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF, + CostKind); + break; + } + default: + return false; + } + if (RedCost >= ScalarCost) + return false; + + return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr; + }; if (Candidates.size() == 1) - return tryToVectorizeList({Op0, Op1}, R); + return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R); // We have multiple options. Try to pick the single best. std::optional BestCandidate = R.findBestRootPair(Candidates); if (!BestCandidate) return false; - return tryToVectorizeList( - {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); + return (*BestCandidate == 0 && + TryToReduce(I, {Candidates[*BestCandidate].first, + Candidates[*BestCandidate].second})) || + tryToVectorizeList({Candidates[*BestCandidate].first, + Candidates[*BestCandidate].second}, + R); } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9a6e4b36397b3..85741b977bb77 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4183,7 +4183,8 @@ class VPlan { /// block with multiple predecessors (one for the exit via the latch and one /// via the other early exit). 
bool hasEarlyExit() const { - return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1; + return ExitBlocks.size() > 1 || + (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1); } /// Returns true if the scalar tail may execute after the vector loop. Note diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 75ade13b09d9c..3c367664a0988 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3055,8 +3055,7 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); - const Align Alignment = - getLoadStoreAlignment(const_cast(&Ingredient)); + const Align Alignment = getLoadStoreAlignment(&Ingredient); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); unsigned Opcode = isa(this) @@ -3196,10 +3195,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we // don't need to compare to the legacy cost model. Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); - const Align Alignment = - getLoadStoreAlignment(const_cast(&Ingredient)); - unsigned AS = - getLoadStoreAddressSpace(const_cast(&Ingredient)); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + unsigned AS = getLoadStoreAddressSpace(&Ingredient); InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); if (!Reverse) @@ -3309,10 +3306,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, // TODO: Using getMemoryOpCost() instead of getMaskedMemoryOpCost when we // don't need to compare to the legacy cost model. 
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); - const Align Alignment = - getLoadStoreAlignment(const_cast(&Ingredient)); - unsigned AS = - getLoadStoreAddressSpace(const_cast(&Ingredient)); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + unsigned AS = getLoadStoreAddressSpace(&Ingredient); InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); if (!Reverse) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 02cea8620d271..6a3b3e6e41955 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3083,16 +3083,15 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { } /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be -/// converted to a narrower recipe. \p V is used by a wide recipe \p WideMember -/// that feeds a store interleave group at index \p Idx, \p WideMember0 is the -/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can -/// be narrowed to an index-independent load if it feeds all wide ops at all -/// indices (\p OpV must be the operand at index \p OpIdx for both the recipe at -/// lane 0, \p WideMember0, and \p WideMember). A VPInterleaveRecipe can be -/// narrowed to a wide load, if \p V is defined at \p Idx of a load interleave -/// group. -static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember, - unsigned OpIdx, VPValue *OpV, unsigned Idx) { +/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a +/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding +/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to +/// an index-independent load if it feeds all wide ops at all indices (\p OpV +/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p +/// WideMember0). 
A VPInterleaveRecipe can be narrowed to a wide load, if \p V +/// is defined at \p Idx of a load interleave group. +static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, + VPValue *OpV, unsigned Idx) { auto *DefR = OpV->getDefiningRecipe(); if (!DefR) return WideMember0->getOperand(OpIdx) == OpV; @@ -3165,6 +3164,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) continue; + if (isa(&R) && + vputils::onlyFirstLaneUsed(cast(&R))) + continue; + // Bail out on recipes not supported at the moment: // * phi recipes other than the canonical induction // * recipes writing to memory except interleave groups @@ -3236,9 +3239,9 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, R->getNumOperands() > 2) return; if (any_of(enumerate(R->operands()), - [WideMember0, Idx = I, R](const auto &P) { + [WideMember0, Idx = I](const auto &P) { const auto &[OpIdx, OpV] = P; - return !canNarrowLoad(WideMember0, R, OpIdx, OpV, Idx); + return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx); })) return; } diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll index 39debd8e3dddc..4bb4818cc53ef 100644 --- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll @@ -978,6 +978,122 @@ define void @store() { ret void } +define void @gather() { +; ARGBASED-LABEL: 'gather' +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.gather.v2i8.v2p0(<2 x ptr> poison, <2 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x i8> @llvm.vp.gather.v4i8.v4p0(<4 x ptr> poison, <4 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <8 x i8> @llvm.vp.gather.v8i8.v8p0(<8 x ptr> poison, <8 x i1> poison, i32 
poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <16 x i8> @llvm.vp.gather.v16i8.v16p0(<16 x ptr> poison, <16 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %5 = call <2 x i64> @llvm.vp.gather.v2i64.v2p0(<2 x ptr> poison, <2 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %6 = call <4 x i64> @llvm.vp.gather.v4i64.v4p0(<4 x ptr> poison, <4 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %7 = call <8 x i64> @llvm.vp.gather.v8i64.v8p0(<8 x ptr> poison, <8 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %8 = call <16 x i64> @llvm.vp.gather.v16i64.v16p0(<16 x ptr> poison, <16 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.vp.gather.nxv2i8.nxv2p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = call @llvm.vp.gather.nxv4i8.nxv4p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %11 = call @llvm.vp.gather.nxv8i8.nxv8p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %12 = call @llvm.vp.gather.nxv16i8.nxv16p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: %13 = call @llvm.vp.gather.nxv2i64.nxv2p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: %14 = call @llvm.vp.gather.nxv4i64.nxv4p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: %15 = call @llvm.vp.gather.nxv8i64.nxv8p0( poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.vp.gather.nxv16i64.nxv16p0( poison, poison, i32 poison) +; 
ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; TYPEBASED-LABEL: 'gather' +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %1 = call <2 x i8> @llvm.vp.gather.v2i8.v2p0(<2 x ptr> poison, <2 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %2 = call <4 x i8> @llvm.vp.gather.v4i8.v4p0(<4 x ptr> poison, <4 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %3 = call <8 x i8> @llvm.vp.gather.v8i8.v8p0(<8 x ptr> poison, <8 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %4 = call <16 x i8> @llvm.vp.gather.v16i8.v16p0(<16 x ptr> poison, <16 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %5 = call <2 x i64> @llvm.vp.gather.v2i64.v2p0(<2 x ptr> poison, <2 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %6 = call <4 x i64> @llvm.vp.gather.v4i64.v4p0(<4 x ptr> poison, <4 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %7 = call <8 x i64> @llvm.vp.gather.v8i64.v8p0(<8 x ptr> poison, <8 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %8 = call <16 x i64> @llvm.vp.gather.v16i64.v16p0(<16 x ptr> poison, <16 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.vp.gather.nxv2i8.nxv2p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.vp.gather.nxv4i8.nxv4p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %11 = call @llvm.vp.gather.nxv8i8.nxv8p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %12 = call @llvm.vp.gather.nxv16i8.nxv16p0( poison, 
poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %13 = call @llvm.vp.gather.nxv2i64.nxv2p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %14 = call @llvm.vp.gather.nxv4i64.nxv4p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %15 = call @llvm.vp.gather.nxv8i64.nxv8p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.vp.gather.nxv16i64.nxv16p0( poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call <2 x i8> @llvm.vp.gather(<2 x ptr> poison, <2 x i1> poison, i32 poison) + call <4 x i8> @llvm.vp.gather(<4 x ptr> poison, <4 x i1> poison, i32 poison) + call <8 x i8> @llvm.vp.gather(<8 x ptr> poison, <8 x i1> poison, i32 poison) + call <16 x i8> @llvm.vp.gather(<16 x ptr> poison, <16 x i1> poison, i32 poison) + call <2 x i64> @llvm.vp.gather(<2 x ptr> poison, <2 x i1> poison, i32 poison) + call <4 x i64> @llvm.vp.gather(<4 x ptr> poison, <4 x i1> poison, i32 poison) + call <8 x i64> @llvm.vp.gather(<8 x ptr> poison, <8 x i1> poison, i32 poison) + call <16 x i64> @llvm.vp.gather(<16 x ptr> poison, <16 x i1> poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + call @llvm.vp.gather( poison, poison, i32 poison) + ret void +} + +define void @scatter() { +; ARGBASED-LABEL: 'scatter' +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.scatter.v2i8.v2p0(<2 x i8> poison, <2 x ptr> poison, <2 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost 
of 4 for instruction: call void @llvm.vp.scatter.v4i8.v4p0(<4 x i8> poison, <4 x ptr> poison, <4 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.scatter.v8i8.v8p0(<8 x i8> poison, <8 x ptr> poison, <8 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.scatter.v16i8.v16p0(<16 x i8> poison, <16 x ptr> poison, <16 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.scatter.v2i64.v2p0(<2 x i64> poison, <2 x ptr> poison, <2 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.vp.scatter.v4i64.v4p0(<4 x i64> poison, <4 x ptr> poison, <4 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.vp.scatter.v8i64.v8p0(<8 x i64> poison, <8 x ptr> poison, <8 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.vp.scatter.v16i64.v16p0(<16 x i64> poison, <16 x ptr> poison, <16 x i1> poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.scatter.nxv2i8.nxv2p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.scatter.nxv4i8.nxv4p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.scatter.nxv8i8.nxv8p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.vp.scatter.nxv16i8.nxv16p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv2i64.nxv2p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: 
Invalid cost for instruction: call void @llvm.vp.scatter.nxv4i64.nxv4p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv8i64.nxv8p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv16i64.nxv16p0( poison, poison, poison, i32 poison) +; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; TYPEBASED-LABEL: 'scatter' +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.scatter.v2i8.v2p0(<2 x i8> poison, <2 x ptr> poison, <2 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.vp.scatter.v4i8.v4p0(<4 x i8> poison, <4 x ptr> poison, <4 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.vp.scatter.v8i8.v8p0(<8 x i8> poison, <8 x ptr> poison, <8 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.vp.scatter.v16i8.v16p0(<16 x i8> poison, <16 x ptr> poison, <16 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.scatter.v2i64.v2p0(<2 x i64> poison, <2 x ptr> poison, <2 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.vp.scatter.v4i64.v4p0(<4 x i64> poison, <4 x ptr> poison, <4 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.vp.scatter.v8i64.v8p0(<8 x i64> poison, <8 x ptr> poison, <8 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.vp.scatter.v16i64.v16p0(<16 x i64> poison, <16 x ptr> poison, <16 x i1> poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid 
cost for instruction: call void @llvm.vp.scatter.nxv2i8.nxv2p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv4i8.nxv4p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv8i8.nxv8p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv16i8.nxv16p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv2i64.nxv2p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv4i64.nxv4p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv8i64.nxv8p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.vp.scatter.nxv16i64.nxv16p0( poison, poison, poison, i32 poison) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.vp.scatter(<2 x i8> poison, <2 x ptr> poison, <2 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<4 x i8> poison, <4 x ptr> poison, <4 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<8 x i8> poison, <8 x ptr> poison, <8 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<16 x i8> poison, <16 x ptr> poison, <16 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<2 x i64> poison, <2 x ptr> poison, <2 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<4 x i64> poison, <4 x ptr> poison, <4 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<8 x i64> poison, <8 x ptr> poison, <8 x i1> poison, i32 poison) + call void @llvm.vp.scatter(<16 x i64> poison, <16 x ptr> poison, <16 x i1> poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call 
void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + call void @llvm.vp.scatter( poison, poison, poison, i32 poison) + ret void +} + define void @strided_load() { ; ARGBASED-LABEL: 'strided_load' ; ARGBASED-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef) diff --git a/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll b/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll index 6768e9067dca3..d3301520fd107 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll @@ -802,7 +802,7 @@ define void @banerjee9(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; CHECK-NEXT: da analyze - output [* *]! ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %1 = load i64, ptr %arrayidx7, align 8 -; CHECK-NEXT: da analyze - flow [<= =|<]! +; CHECK-NEXT: da analyze - flow [<= 0|<]! ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %1, ptr %B.addr.11, align 8 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %1 = load i64, ptr %arrayidx7, align 8 --> Dst: %1 = load i64, ptr %arrayidx7, align 8 @@ -816,7 +816,7 @@ define void @banerjee9(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; NORMALIZE-NEXT: da analyze - output [* *]! 
; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %1 = load i64, ptr %arrayidx7, align 8 -; NORMALIZE-NEXT: da analyze - flow [<= =|<]! +; NORMALIZE-NEXT: da analyze - flow [<= 0|<]! ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %1, ptr %B.addr.11, align 8 ; NORMALIZE-NEXT: da analyze - confused! ; NORMALIZE-NEXT: Src: %1 = load i64, ptr %arrayidx7, align 8 --> Dst: %1 = load i64, ptr %arrayidx7, align 8 @@ -830,7 +830,7 @@ define void @banerjee9(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; DELIN-NEXT: da analyze - output [* *]! ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %1 = load i64, ptr %arrayidx7, align 8 -; DELIN-NEXT: da analyze - flow [<= =|<]! +; DELIN-NEXT: da analyze - flow [<= 0|<]! ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %1, ptr %B.addr.11, align 8 ; DELIN-NEXT: da analyze - confused! ; DELIN-NEXT: Src: %1 = load i64, ptr %arrayidx7, align 8 --> Dst: %1 = load i64, ptr %arrayidx7, align 8 @@ -888,7 +888,7 @@ define void @banerjee10(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %1 = load i64, ptr %arrayidx6, align 8 -; CHECK-NEXT: da analyze - flow [<> =]! +; CHECK-NEXT: da analyze - flow [<> 0]! ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %1, ptr %B.addr.11, align 8 ; CHECK-NEXT: da analyze - confused! 
; CHECK-NEXT: Src: %1 = load i64, ptr %arrayidx6, align 8 --> Dst: %1 = load i64, ptr %arrayidx6, align 8 @@ -902,7 +902,7 @@ define void @banerjee10(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; NORMALIZE-NEXT: da analyze - none! ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %1 = load i64, ptr %arrayidx6, align 8 -; NORMALIZE-NEXT: da analyze - flow [<> =]! +; NORMALIZE-NEXT: da analyze - flow [<> 0]! ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %1, ptr %B.addr.11, align 8 ; NORMALIZE-NEXT: da analyze - confused! ; NORMALIZE-NEXT: Src: %1 = load i64, ptr %arrayidx6, align 8 --> Dst: %1 = load i64, ptr %arrayidx6, align 8 @@ -916,7 +916,7 @@ define void @banerjee10(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; DELIN-NEXT: da analyze - none! ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %1 = load i64, ptr %arrayidx6, align 8 -; DELIN-NEXT: da analyze - flow [<> =]! +; DELIN-NEXT: da analyze - flow [<> 0]! ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %1, ptr %B.addr.11, align 8 ; DELIN-NEXT: da analyze - confused! ; DELIN-NEXT: Src: %1 = load i64, ptr %arrayidx6, align 8 --> Dst: %1 = load i64, ptr %arrayidx6, align 8 @@ -1058,7 +1058,7 @@ define void @banerjee12(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %0 = load i64, ptr %arrayidx6, align 8 -; CHECK-NEXT: da analyze - flow [= <>]! +; CHECK-NEXT: da analyze - flow [0 <>]! 
; CHECK-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %0, ptr %B.addr.11, align 8 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i64, ptr %arrayidx6, align 8 --> Dst: %0 = load i64, ptr %arrayidx6, align 8 @@ -1072,7 +1072,7 @@ define void @banerjee12(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; NORMALIZE-NEXT: da analyze - none! ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %0 = load i64, ptr %arrayidx6, align 8 -; NORMALIZE-NEXT: da analyze - flow [= <>]! +; NORMALIZE-NEXT: da analyze - flow [0 <>]! ; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %0, ptr %B.addr.11, align 8 ; NORMALIZE-NEXT: da analyze - confused! ; NORMALIZE-NEXT: Src: %0 = load i64, ptr %arrayidx6, align 8 --> Dst: %0 = load i64, ptr %arrayidx6, align 8 @@ -1086,7 +1086,7 @@ define void @banerjee12(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp { ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8 ; DELIN-NEXT: da analyze - none! ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: %0 = load i64, ptr %arrayidx6, align 8 -; DELIN-NEXT: da analyze - flow [= <>]! +; DELIN-NEXT: da analyze - flow [0 <>]! ; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 %0, ptr %B.addr.11, align 8 ; DELIN-NEXT: da analyze - confused! 
; DELIN-NEXT: Src: %0 = load i64, ptr %arrayidx6, align 8 --> Dst: %0 = load i64, ptr %arrayidx6, align 8 diff --git a/llvm/test/Analysis/DependenceAnalysis/Coupled.ll b/llvm/test/Analysis/DependenceAnalysis/Coupled.ll index ff9f393f88152..06bfc5d2e8573 100644 --- a/llvm/test/Analysis/DependenceAnalysis/Coupled.ll +++ b/llvm/test/Analysis/DependenceAnalysis/Coupled.ll @@ -285,7 +285,7 @@ define void @couple6(ptr %A, ptr %B, i32 %n) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx1, align 4 --> Dst: store i32 %conv, ptr %arrayidx1, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx1, align 4 --> Dst: %0 = load i32, ptr %arrayidx3, align 4 -; CHECK-NEXT: da analyze - flow [=|<]! +; CHECK-NEXT: da analyze - flow [0|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx1, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx3, align 4 --> Dst: %0 = load i32, ptr %arrayidx3, align 4 @@ -503,7 +503,7 @@ define void @couple11(ptr %A, ptr %B, i32 %n) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx2, align 4 --> Dst: store i32 %conv, ptr %arrayidx2, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx2, align 4 --> Dst: %0 = load i32, ptr %arrayidx4, align 4 -; CHECK-NEXT: da analyze - flow [=|<] splitable! +; CHECK-NEXT: da analyze - flow [0|<] splitable! ; CHECK-NEXT: da analyze - split level = 1, iteration = 9! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx2, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - confused! @@ -636,7 +636,7 @@ define void @couple14(ptr %A, ptr %B, i32 %n) nounwind uwtable ssp { ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx3, align 4 --> Dst: store i32 %conv, ptr %arrayidx3, align 4 ; CHECK-NEXT: da analyze - none! 
; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx3, align 4 --> Dst: %0 = load i32, ptr %arrayidx6, align 4 -; CHECK-NEXT: da analyze - flow [=|<]! +; CHECK-NEXT: da analyze - flow [0|<]! ; CHECK-NEXT: Src: store i32 %conv, ptr %arrayidx3, align 4 --> Dst: store i32 %0, ptr %B.addr.01, align 4 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx6, align 4 --> Dst: %0 = load i32, ptr %arrayidx6, align 4 diff --git a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll index f0cd2fd4cd930..e5d5d21e365a1 100644 --- a/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll +++ b/llvm/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll @@ -18,7 +18,7 @@ define void @i32_subscript(ptr %a, ptr %b) { ; CHECK-NEXT: Src: %0 = load i32, ptr %a.addr, align 4 --> Dst: %0 = load i32, ptr %a.addr, align 4 ; CHECK-NEXT: da analyze - none! ; CHECK-NEXT: Src: %0 = load i32, ptr %a.addr, align 4 --> Dst: store i32 %1, ptr %a.addr.2, align 4 -; CHECK-NEXT: da analyze - anti [=|<]! +; CHECK-NEXT: da analyze - anti [0|<]! ; CHECK-NEXT: Src: store i32 %1, ptr %a.addr.2, align 4 --> Dst: store i32 %1, ptr %a.addr.2, align 4 ; CHECK-NEXT: da analyze - none! 
; diff --git a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll index 86ec915bd03ba..83b37da759b5c 100644 --- a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll +++ b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll @@ -3,30 +3,36 @@ ; CHECK: ; ModuleID = 'debuginfo.c' ; CHECK-NEXT: source_filename = "debuginfo.c" - -; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !36 { + +; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !44 { ; CHECK-NEXT: entry: -; CHECK-NEXT: #dbg_declare(i64 0, !43, !DIExpression(), !50) -; CHECK-NEXT: #dbg_declare(i64 0, !44, !DIExpression(), !50) -; CHECK-NEXT: #dbg_declare(i64 0, !45, !DIExpression(), !50) -; CHECK-NEXT: #dbg_label(!51, !50) +; CHECK-NEXT: #dbg_declare(i64 0, !49, !DIExpression(), !58) +; CHECK-NEXT: #dbg_declare(i64 0, !50, !DIExpression(), !58) +; CHECK-NEXT: #dbg_declare(i64 0, !51, !DIExpression(), !58) +; CHECK-NEXT: #dbg_label(!59, !58) ; CHECK-NEXT: br label %vars -; CHECK-NEXT: #dbg_label(!52, !50) +; CHECK-NEXT: #dbg_label(!60, !58) ; CHECK-NEXT: br label %vars -; CHECK: vars: + +; CHECK: vars: ; preds = %entry, %entry ; CHECK-NEXT: %p1 = phi i64 [ 0, %entry ] ; CHECK-NEXT: %p2 = phi i64 [ 0, %entry ] -; CHECK-NEXT: #dbg_value(i64 0, !46, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !53) -; CHECK-NEXT: #dbg_value(i64 1, !48, !DIExpression(DW_OP_constu, 1, DW_OP_stack_value), !53) +; CHECK-NEXT: #dbg_value(i64 0, !42, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !61) +; CHECK-NEXT: #dbg_value(i64 1, !52, !DIExpression(DW_OP_constu, 1, DW_OP_stack_value), !61) ; CHECK-NEXT: %a = add i64 %p1, %p2 ; CHECK-NEXT: ret i64 0 ; CHECK-NEXT: } - + ; CHECK: !llvm.dbg.cu = !{!0} ; CHECK-NEXT: !FooType = !{!33} ; CHECK-NEXT: !EnumTest = !{!3} ; CHECK-NEXT: !LargeEnumTest = !{!11} - +; CHECK-NEXT: !SubrangeType = !{!36} +; CHECK-NEXT: !SetType1 = !{!37} +; CHECK-NEXT: !SetType2 = !{!38} +; CHECK-NEXT: !DynType = !{!39} +; CHECK-NEXT: 
!ClassType = !{!54} + ; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !16, imports: !24, macros: !28, splitDebugInlining: false, sysroot: "/") ; CHECK-NEXT: !1 = !DIFile(filename: "debuginfo.c", directory: ".") ; CHECK-NEXT: !2 = !{!3, !11} @@ -63,21 +69,29 @@ ; CHECK-NEXT: !33 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !34, size: 192, dwarfAddressSpace: 0) ; CHECK-NEXT: !34 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !4, file: !1, size: 192, elements: !35, runtimeLang: DW_LANG_C89, identifier: "MyStruct") ; CHECK-NEXT: !35 = !{!6, !6, !6} -; CHECK-NEXT: !36 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !37, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !42) -; CHECK-NEXT: !37 = !DISubroutineType(types: !38) -; CHECK-NEXT: !38 = !{!6, !6, !39} -; CHECK-NEXT: !39 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !40) +; CHECK-NEXT: !36 = !DISubrangeType(name: "foo", scope: !1, file: !1, line: 42, size: 64, baseType: !6, lowerBound: i64 0, upperBound: i64 1, stride: i64 8, bias: i64 4) +; CHECK-NEXT: !37 = !DIDerivedType(tag: DW_TAG_set_type, name: "enumset", scope: !1, file: !1, line: 42, baseType: !3, size: 64) +; CHECK-NEXT: !38 = !DIDerivedType(tag: DW_TAG_set_type, name: "subrangeset", scope: !1, file: !1, line: 42, baseType: !36, size: 64) +; CHECK-NEXT: !39 = !DICompositeType(tag: DW_TAG_array_type, name: "foo", scope: !1, file: !1, line: 42, baseType: !6, size: 640, elements: !40, dataLocation: !DIExpression(), associated: !42, rank: !DIExpression()) ; CHECK-NEXT: !40 = !{!41} ; CHECK-NEXT: !41 = !DISubrange(count: 10, lowerBound: 0) -; CHECK-NEXT: !42 = !{!43, !44, !45, !46, !48, !49} -; CHECK-NEXT: !43 = !DILocalVariable(name: "a", arg: 1, scope: !36, 
file: !1, line: 42, type: !6) -; CHECK-NEXT: !44 = !DILocalVariable(name: "b", arg: 2, scope: !36, file: !1, line: 42, type: !6) -; CHECK-NEXT: !45 = !DILocalVariable(name: "c", arg: 3, scope: !36, file: !1, line: 42, type: !39) -; CHECK-NEXT: !46 = !DILocalVariable(name: "d", scope: !47, file: !1, line: 43, type: !6) -; CHECK-NEXT: !47 = distinct !DILexicalBlock(scope: !36, file: !1, line: 42) -; CHECK-NEXT: !48 = !DILocalVariable(name: "e", scope: !47, file: !1, line: 44, type: !6) -; CHECK-NEXT: !49 = !DILabel(scope: !36, name: "label3", file: !1, line: 42) -; CHECK-NEXT: !50 = !DILocation(line: 42, scope: !36) -; CHECK-NEXT: !51 = !DILabel(scope: !36, name: "label1", file: !1, line: 42) -; CHECK-NEXT: !52 = !DILabel(scope: !36, name: "label2", file: !1, line: 42) -; CHECK-NEXT: !53 = !DILocation(line: 43, scope: !36) +; CHECK-NEXT: !42 = !DILocalVariable(name: "d", scope: !43, file: !1, line: 43, type: !6) +; CHECK-NEXT: !43 = distinct !DILexicalBlock(scope: !44, file: !1, line: 42) +; CHECK-NEXT: !44 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !45, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !48) +; CHECK-NEXT: !45 = !DISubroutineType(types: !46) +; CHECK-NEXT: !46 = !{!6, !6, !47} +; CHECK-NEXT: !47 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !40) +; CHECK-NEXT: !48 = !{!49, !50, !51, !42, !52, !53} +; CHECK-NEXT: !49 = !DILocalVariable(name: "a", arg: 1, scope: !44, file: !1, line: 42, type: !6) +; CHECK-NEXT: !50 = !DILocalVariable(name: "b", arg: 2, scope: !44, file: !1, line: 42, type: !6) +; CHECK-NEXT: !51 = !DILocalVariable(name: "c", arg: 3, scope: !44, file: !1, line: 42, type: !47) +; CHECK-NEXT: !52 = !DILocalVariable(name: "e", scope: !43, file: !1, line: 44, type: !6) +; CHECK-NEXT: !53 = !DILabel(scope: !44, name: "label3", file: !1, line: 42) +; CHECK-NEXT: !54 = !DICompositeType(tag: 
DW_TAG_class_type, name: "Class", scope: !4, file: !1, size: 192, flags: DIFlagFwdDecl, elements: !55, identifier: "FooClass") +; CHECK-NEXT: !55 = !{!56} +; CHECK-NEXT: !56 = !{!6, !6, !57} +; CHECK-NEXT: !57 = !DIBasicType(name: "Int32", size: 32) +; CHECK-NEXT: !58 = !DILocation(line: 42, scope: !44) +; CHECK-NEXT: !59 = !DILabel(scope: !44, name: "label1", file: !1, line: 42) +; CHECK-NEXT: !60 = !DILabel(scope: !44, name: "label2", file: !1, line: 42) +; CHECK-NEXT: !61 = !DILocation(line: 43, scope: !44) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index d0424f2e400fc..bd2d8c095831b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -636,6 +636,9 @@ # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_GET_ROUNDING (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_PTR_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK diff --git a/llvm/test/CodeGen/AArch64/ldst_update_cfpath.mir b/llvm/test/CodeGen/AArch64/ldst_update_cfpath.mir new file mode 100644 index 0000000000000..8f541af2eb1a3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst_update_cfpath.mir @@ -0,0 +1,386 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-none-linux-gnu" + + ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) + define dso_local void @test(ptr noundef captures(none) %data) local_unnamed_addr #0 { + entry: + br i1 undef, label %while.cond.preheader, label %for.body.preheader + + for.body.preheader: ; preds = %entry + br label %for.body + + while.cond.preheader.loopexit: ; preds = %for.body + br label %while.cond.preheader + + while.cond.preheader: ; preds = %while.cond.preheader.loopexit, %entry + br i1 undef, label %while.body.lr.ph.lr.ph, label %for.cond28.preheader + + while.body.lr.ph.lr.ph: ; preds = %while.cond.preheader + br label %while.body.preheader + + for.body: ; preds = %for.body, %for.body.preheader + br i1 undef, label %for.body, label %while.cond.preheader.loopexit + + for.cond28.preheader: ; preds = %if.then, %if.end, %while.cond.preheader + br i1 undef, label %for.end45, label %for.body36.preheader + + for.body36.preheader: ; preds = %for.cond28.preheader + br label %for.body36 + + while.body: ; preds = %if.end, %while.body.preheader + br i1 undef, label %if.end, label %if.then + + if.then: ; preds = %while.body + br i1 undef, label %while.body.preheader, label %for.cond28.preheader + + while.body.preheader: ; preds = %if.then, %while.body.lr.ph.lr.ph + br label %while.body + + if.end: ; preds = %while.body + br i1 undef, label %for.cond28.preheader, 
label %while.body + + for.body36: ; preds = %for.body36.preheader, %for.body36 + br i1 undef, label %for.body36, label %for.end45 + + for.end45: ; preds = %for.body36, %for.cond28.preheader + ret void + } + +... +--- +name: test +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: + - { reg: '$x0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: true + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + hasRedZone: false +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w8 = LDRBBui renamable $x0, 4 + ; CHECK-NEXT: TBNZW killed renamable $w8, 3, %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w11 = MOVZWi 1, 0 + ; CHECK-NEXT: renamable $w9 = MOVZWi 1, 0 + ; CHECK-NEXT: renamable $w8 = MOVZWi 2, 0, implicit-def $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
bb.2.while.cond.preheader: + ; CHECK-NEXT: successors: %bb.3(0x60000000), %bb.8(0x20000000) + ; CHECK-NEXT: liveins: $w9, $w11, $x0, $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $wzr = SUBSWri renamable $w11, 299, 0, implicit-def $nzcv + ; CHECK-NEXT: Bcc 8, %bb.8, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.while.cond.preheader: + ; CHECK-NEXT: successors: %bb.4(0x55555555), %bb.8(0x2aaaaaab) + ; CHECK-NEXT: liveins: $w9, $x0, $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x10 = nuw ADDXrx renamable $x0, renamable $w9, 18 + ; CHECK-NEXT: renamable $w11 = LDRWui renamable $x10, 0 + ; CHECK-NEXT: TBZW killed renamable $w11, 3, %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.14(0x80000000) + ; CHECK-NEXT: liveins: $w9, $x0, $x8, $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w9 = ORRWrs $wzr, killed renamable $w9, 0, implicit-def $x9 + ; CHECK-NEXT: B %bb.14 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.for.body.preheader: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x10 = ADDXri renamable $x0, 8, 0 + ; CHECK-NEXT: renamable $w9 = MOVZWi 1, 0 + ; CHECK-NEXT: renamable $w12 = MOVZWi 2, 0, implicit-def $x12 + ; CHECK-NEXT: $x11 = ORRXrs $xzr, $x10, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.for.body: + ; CHECK-NEXT: successors: %bb.7(0x7e000000), %bb.2(0x02000000) + ; CHECK-NEXT: liveins: $w9, $x0, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w8 = LDURWi renamable $x11, -4 + ; CHECK-NEXT: STRWui killed renamable $w8, renamable $x11, 0 + ; CHECK-NEXT: renamable $x8 = nuw nsw ADDXri renamable $x12, 2, 0 + ; CHECK-NEXT: renamable $w9 = nuw nsw ADDWri killed renamable $w9, 1, 0 + ; CHECK-NEXT: early-clobber renamable $x10, renamable $w13 = LDRBBpost killed renamable $x10, 4 + ; CHECK-NEXT: TBZW killed renamable $w13, 3, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.for.body: + ; CHECK-NEXT: 
successors: %bb.6(0x7df7df7e), %bb.2(0x02082082) + ; CHECK-NEXT: liveins: $w9, $x0, $x8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x11 = ADDXri killed renamable $x11, 8, 0 + ; CHECK-NEXT: dead $xzr = SUBSXri killed renamable $x12, 598, 0, implicit-def $nzcv + ; CHECK-NEXT: $x12 = ORRXrs $xzr, $x8, 0 + ; CHECK-NEXT: Bcc 0, %bb.2, implicit $nzcv + ; CHECK-NEXT: B %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.for.cond28.preheader: + ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: liveins: $x0, $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w9 = LDRBBui renamable $x0, 4 + ; CHECK-NEXT: TBZW killed renamable $w9, 3, %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.for.body36.preheader: + ; CHECK-NEXT: successors: %bb.10(0x80000000) + ; CHECK-NEXT: liveins: $x0, $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w9 = MOVZWi 8, 0, implicit-def $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.for.body36: + ; CHECK-NEXT: successors: %bb.11(0x7e000000), %bb.12(0x02000000) + ; CHECK-NEXT: liveins: $x0, $x8, $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w10 = SUBWri renamable $w8, 1, 0 + ; CHECK-NEXT: renamable $w10 = LDRWroW renamable $x0, killed renamable $w10, 0, 1 + ; CHECK-NEXT: early-clobber renamable $x8 = STRWpost killed renamable $w10, renamable $x8, 2 + ; CHECK-NEXT: renamable $w10 = LDRBBroX renamable $x0, renamable $x9, 0, 0 + ; CHECK-NEXT: TBZW killed renamable $w10, 3, %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.for.body36: + ; CHECK-NEXT: successors: %bb.10(0x7df7df7e), %bb.12(0x02082082) + ; CHECK-NEXT: liveins: $x0, $x8, $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $xzr = SUBSXri renamable $x9, 1200, 0, implicit-def $nzcv + ; CHECK-NEXT: renamable $x9 = nuw nsw ADDXri killed renamable $x9, 4, 0 + ; CHECK-NEXT: Bcc 1, %bb.10, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.for.end45: + ; CHECK-NEXT: RET undef $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
bb.13.if.end: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.14(0x7c000000) + ; CHECK-NEXT: liveins: $x0, $x8, $x9, $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w11 = LDRBBui renamable $x10, 0 + ; CHECK-NEXT: TBZW killed renamable $w11, 3, %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.14.while.body: + ; CHECK-NEXT: successors: %bb.13(0x7c000000), %bb.15(0x04000000) + ; CHECK-NEXT: liveins: $x0, $x8, $x9, $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w11 = SUBWri renamable $w8, 1, 0 + ; CHECK-NEXT: renamable $w11 = LDRWroW renamable $x0, killed renamable $w11, 0, 1 + ; CHECK-NEXT: early-clobber renamable $x8 = STRWpost renamable $w11, renamable $x8, 2 + ; CHECK-NEXT: TBZW killed renamable $w11, 2, %bb.13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.15.if.then: + ; CHECK-NEXT: successors: %bb.16(0x7e000000), %bb.8(0x02000000) + ; CHECK-NEXT: liveins: $x0, $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w8 = LDRWui renamable $x0, 4 + ; CHECK-NEXT: STRWroX killed renamable $w8, renamable $x0, renamable $x9, 0, 1 + ; CHECK-NEXT: renamable $w8 = MOVZWi 3, 0, implicit-def $x8 + ; CHECK-NEXT: dead $xzr = SUBSXri renamable $x9, 298, 0, implicit-def $nzcv + ; CHECK-NEXT: Bcc 8, %bb.8, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.16.if.then: + ; CHECK-NEXT: successors: %bb.14(0x7df7df7e), %bb.8(0x02082082) + ; CHECK-NEXT: liveins: $x0, $x8, $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x9 = nuw nsw ADDXri killed renamable $x9, 1, 0 + ; CHECK-NEXT: renamable $x10 = nuw ADDXrs renamable $x0, renamable $x9, 2 + ; CHECK-NEXT: renamable $w11 = LDRWui renamable $x10, 0 + ; CHECK-NEXT: TBZW killed renamable $w11, 3, %bb.8 + ; CHECK-NEXT: B %bb.14 + bb.0.entry: + successors: %bb.1(0x40000000), %bb.5(0x40000000) + liveins: $x0 + + renamable $w8 = LDRBBui renamable $x0, 4 + TBNZW killed renamable $w8, 3, %bb.5 + + bb.1: + successors: %bb.2(0x80000000) + liveins: $x0 + + renamable $w11 = MOVZWi 1, 0 + renamable $w9 = MOVZWi 1, 0 + 
renamable $w8 = MOVZWi 2, 0, implicit-def $x8 + + bb.2.while.cond.preheader: + successors: %bb.3(0x60000000), %bb.8(0x20000000) + liveins: $w9, $w11, $x0, $x8 + + dead $wzr = SUBSWri renamable $w11, 299, 0, implicit-def $nzcv + Bcc 8, %bb.8, implicit $nzcv + + bb.3.while.cond.preheader: + successors: %bb.4(0x55555555), %bb.8(0x2aaaaaab) + liveins: $w9, $x0, $x8 + + renamable $x10 = nuw ADDXrx renamable $x0, renamable $w9, 18 + renamable $w11 = LDRWui renamable $x10, 0 + TBZW killed renamable $w11, 3, %bb.8 + + bb.4: + successors: %bb.14(0x80000000) + liveins: $w9, $x0, $x8, $x10 + + renamable $w9 = ORRWrs $wzr, killed renamable $w9, 0, implicit-def $x9 + B %bb.14 + + bb.5.for.body.preheader: + successors: %bb.6(0x80000000) + liveins: $x0 + + renamable $x10 = ADDXri renamable $x0, 8, 0 + renamable $w9 = MOVZWi 1, 0 + renamable $w12 = MOVZWi 2, 0, implicit-def $x12 + $x11 = ORRXrs $xzr, $x10, 0 + + bb.6.for.body: + successors: %bb.7(0x7e000000), %bb.2(0x02000000) + liveins: $w9, $x0, $x10, $x11, $x12 + + renamable $w8 = LDURWi renamable $x11, -4 + STRWui killed renamable $w8, renamable $x11, 0 + renamable $x8 = nuw nsw ADDXri renamable $x12, 2, 0 + renamable $w9 = nuw nsw ADDWri killed renamable $w9, 1, 0 + early-clobber renamable $x10, renamable $w13 = LDRBBpost killed renamable $x10, 4 + TBZW killed renamable $w13, 3, %bb.2 + + bb.7.for.body: + successors: %bb.6(0x7df7df7e), %bb.2(0x02082082) + liveins: $w9, $x0, $x8, $x10, $x11, $x12 + + renamable $x11 = ADDXri killed renamable $x11, 8, 0 + dead $xzr = SUBSXri killed renamable $x12, 598, 0, implicit-def $nzcv + $x12 = ORRXrs $xzr, $x8, 0 + Bcc 0, %bb.2, implicit $nzcv + B %bb.6 + + bb.8.for.cond28.preheader: + successors: %bb.12(0x40000000), %bb.9(0x40000000) + liveins: $x0, $x8 + + renamable $w9 = LDRBBui renamable $x0, 4 + TBZW killed renamable $w9, 3, %bb.12 + + bb.9.for.body36.preheader: + successors: %bb.10(0x80000000) + liveins: $x0, $x8 + + renamable $w9 = MOVZWi 8, 0, implicit-def $x9 + + bb.10.for.body36: 
+ successors: %bb.11(0x7e000000), %bb.12(0x02000000) + liveins: $x0, $x8, $x9 + + renamable $w10 = SUBWri renamable $w8, 1, 0 + renamable $w10 = LDRWroW renamable $x0, killed renamable $w10, 0, 1 + STRWui killed renamable $w10, renamable $x8, 0 + renamable $w10 = LDRBBroX renamable $x0, renamable $x9, 0, 0 + TBZW killed renamable $w10, 3, %bb.12 + + bb.11.for.body36: + successors: %bb.10(0x7df7df7e), %bb.12(0x02082082) + liveins: $x0, $x8, $x9 + + renamable $x8 = ADDXri renamable $x8, 2, 0, implicit killed $x8, implicit-def $x8 + dead $xzr = SUBSXri renamable $x9, 1200, 0, implicit-def $nzcv + renamable $x9 = nuw nsw ADDXri killed renamable $x9, 4, 0 + Bcc 1, %bb.10, implicit $nzcv + + bb.12.for.end45: + RET undef $lr + + bb.13.if.end: + successors: %bb.8(0x04000000), %bb.14(0x7c000000) + liveins: $x0, $x8, $x9, $x10 + + renamable $x8 = ADDXri renamable $x8, 2, 0, implicit killed $x8, implicit-def $x8 + renamable $w11 = LDRBBui renamable $x10, 0 + TBZW killed renamable $w11, 3, %bb.8 + + bb.14.while.body: + successors: %bb.13(0x7c000000), %bb.15(0x04000000) + liveins: $x0, $x8, $x9, $x10 + + renamable $w11 = SUBWri renamable $w8, 1, 0 + renamable $w11 = LDRWroW renamable $x0, killed renamable $w11, 0, 1 + STRWui renamable $w11, renamable $x8, 0 + TBZW killed renamable $w11, 2, %bb.13 + + bb.15.if.then: + successors: %bb.16(0x7e000000), %bb.8(0x02000000) + liveins: $x0, $x9 + + renamable $w8 = LDRWui renamable $x0, 4 + STRWroX killed renamable $w8, renamable $x0, renamable $x9, 0, 1 + renamable $w8 = MOVZWi 3, 0, implicit-def $x8 + dead $xzr = SUBSXri renamable $x9, 298, 0, implicit-def $nzcv + Bcc 8, %bb.8, implicit $nzcv + + bb.16.if.then: + successors: %bb.14(0x7df7df7e), %bb.8(0x02082082) + liveins: $x0, $x8, $x9 + + renamable $x9 = nuw nsw ADDXri killed renamable $x9, 1, 0 + renamable $x10 = nuw ADDXrs renamable $x0, renamable $x9, 2 + renamable $w11 = LDRWui renamable $x10, 0 + TBZW killed renamable $w11, 3, %bb.8 + B %bb.14 +... 
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir b/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir index dc75c8c61c53c..c944889ede695 100644 --- a/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir +++ b/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir @@ -14,8 +14,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 - ; CHECK-NEXT: [[MOVZWi:%[0-9]+]]:gpr32common = nsw MOVZWi 79, 0 - ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVZWi]] + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = nsw MOVi32imm 79 + ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVi32imm]] ; CHECK-NEXT: $w0 = COPY [[MADDWrrr]] ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:gpr32 = COPY $w0 @@ -38,8 +38,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[MOVZXi:%[0-9]+]]:gpr64common = nsw MOVZXi 79, 0 - ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVZXi]] + ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64 = nsw MOVi64imm 79 + ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVi64imm]] ; CHECK-NEXT: $x0 = COPY [[MADDXrrr]] ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:gpr64 = COPY $x0 @@ -62,8 +62,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 - ; CHECK-NEXT: [[MOVNWi:%[0-9]+]]:gpr32common = nsw MOVNWi 0, 0 - ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32 = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVNWi]] + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = nsw MOVi32imm -1 + ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32 = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVi32imm]] ; CHECK-NEXT: $w0 = COPY [[MADDWrrr]] ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:gpr32 = COPY $w0 @@ -86,8 +86,8 @@ body: | 
; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[MOVNXi:%[0-9]+]]:gpr64common = nsw MOVNXi 0, 0 - ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVNXi]] + ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64 = nsw MOVi64imm -1 + ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVi64imm]] ; CHECK-NEXT: $x0 = COPY [[MADDXrrr]] ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:gpr64 = COPY $x0 @@ -110,8 +110,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 - ; CHECK-NEXT: [[ORRWri:%[0-9]+]]:gpr32common = nsw ORRWri $wzr, 1291 - ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[ORRWri]] + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = nsw MOVi32imm 16773120 + ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVi32imm]] ; CHECK-NEXT: $w0 = COPY [[MADDWrrr]] ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:gpr32 = COPY $w0 @@ -134,8 +134,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[ORRXri:%[0-9]+]]:gpr64common = nsw ORRXri $xzr, 7435 - ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[ORRXri]] + ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64 = nsw MOVi64imm 16773120 + ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVi64imm]] ; CHECK-NEXT: $x0 = COPY [[MADDXrrr]] ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:gpr64 = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/madd-combiner.ll b/llvm/test/CodeGen/AArch64/madd-combiner.ll index 6e510712fbd21..cc7fc8fc98629 100644 --- a/llvm/test/CodeGen/AArch64/madd-combiner.ll +++ b/llvm/test/CodeGen/AArch64/madd-combiner.ll @@ -39,9 +39,8 @@ define void @mul_add_imm2() { ; CHECK-FAST-LABEL: 
mul_add_imm2: ; CHECK-FAST: ; %bb.0: ; %entry ; CHECK-FAST-NEXT: mov x8, #-3 ; =0xfffffffffffffffd -; CHECK-FAST-NEXT: mov x9, #-3 ; =0xfffffffffffffffd -; CHECK-FAST-NEXT: madd x8, x8, x8, x9 ; CHECK-FAST-NEXT: mov x9, #45968 ; =0xb390 +; CHECK-FAST-NEXT: madd x8, x8, x8, x8 ; CHECK-FAST-NEXT: movk x9, #48484, lsl #16 ; CHECK-FAST-NEXT: movk x9, #323, lsl #32 ; CHECK-FAST-NEXT: LBB2_1: ; %for.body8 diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 8c1d41f71c1ec..5c58eab391972 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu" define @fmla_nxv8bf16( %acc, %m1, %m2) { ; CHECK-LABEL: fmla_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul @@ -17,8 +17,8 @@ define @fmla_nxv8bf16( %acc, @fmla_nxv4bf16( %acc, %m1, %m2) { ; CHECK-LABEL: fmla_nxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul @@ -28,8 +28,8 @@ define @fmla_nxv4bf16( %acc, @fmla_nxv2bf16( %acc, %m1, %m2) { ; CHECK-LABEL: fmla_nxv2bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul @@ -39,8 +39,8 @@ define @fmla_nxv2bf16( %acc, @fmls_nxv8bf16( %acc, %m1, %m2) { ; CHECK-LABEL: fmls_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: 
bfmls z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul @@ -50,8 +50,8 @@ define @fmls_nxv8bf16( %acc, @fmls_nxv4bf16( %acc, %m1, %m2) { ; CHECK-LABEL: fmls_nxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul @@ -61,8 +61,8 @@ define @fmls_nxv4bf16( %acc, @fmls_nxv2bf16( %acc, %m1, %m2) { ; CHECK-LABEL: fmls_nxv2bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul @@ -72,9 +72,7 @@ define @fmls_nxv2bf16( %acc, @fmla_sel_nxv8bf16( %pred, %acc, %m1, %m2) { ; CHECK-LABEL: fmla_sel_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul @@ -85,9 +83,7 @@ define @fmla_sel_nxv8bf16( %pred, @fmla_sel_nxv4bf16( %pred, %acc, %m1, %m2) { ; CHECK-LABEL: fmla_sel_nxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul @@ -98,9 +94,7 @@ define @fmla_sel_nxv4bf16( %pred, @fmla_sel_nxv2bf16( %pred, %acc, %m1, %m2) { ; CHECK-LABEL: fmla_sel_nxv2bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul @@ -111,9 +105,7 @@ 
define @fmla_sel_nxv2bf16( %pred, @fmls_sel_nxv8bf16( %pred, %acc, %m1, %m2) { ; CHECK-LABEL: fmls_sel_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul @@ -124,9 +116,7 @@ define @fmls_sel_nxv8bf16( %pred, @fmls_sel_nxv4bf16( %pred, %acc, %m1, %m2) { ; CHECK-LABEL: fmls_sel_nxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul @@ -137,9 +127,7 @@ define @fmls_sel_nxv4bf16( %pred, @fmls_sel_nxv2bf16( %pred, %acc, %m1, %m2) { ; CHECK-LABEL: fmls_sel_nxv2bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll index 584c29ebcfc04..1b6b92af8c64a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll @@ -16,19 +16,16 @@ define @test_signed_v2f32_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.d, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0xffffffff80000000 +; 
CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.d, #0x7fffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.s +; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f32.nxv2i32( %f) @@ -40,19 +37,16 @@ define @test_signed_v4f32_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.s, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.s, #0x80000000 +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.s, #0x7fffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.s, p1/m, z2.s -; CHECK-NEXT: sel z0.s, p2, z3.s, z1.s +; CHECK-NEXT: fcvtzs z1.s, p1/m, z0.s +; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f32.nxv4i32( %f) @@ -62,39 +56,26 @@ define @test_signed_v4f32_v4i32( %f) { define @test_signed_v8f32_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z6.s, #0x7fffffff +; CHECK-NEXT: mov z3.s, #0x80000000 ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.s, p0/m, z1.s +; CHECK-NEXT: mov z4.s, #0x80000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s -; CHECK-NEXT: mov z2.s, #0x80000000 -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s -; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s +; CHECK-NEXT: mov z2.s, #0x7fffffff +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z5.s +; CHECK-NEXT: fcvtzs z3.s, p1/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z5.s +; CHECK-NEXT: fcvtzs z4.s, p2/m, z1.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.s, p1, z2.s, z3.s +; CHECK-NEXT: sel z1.s, p3, z2.s, z4.s +; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f32.nxv8i32( %f) ret %x @@ -105,19 +86,17 @@ define 
@test_signed_v4f32_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z2.s, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.s, #32767 // =0x7fff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.s, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s +; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.s +; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f32.nxv4i16( %f) @@ -127,40 +106,28 @@ define @test_signed_v4f32_v4i16( %f) { define @test_signed_v8f32_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z5.s, #32767 // =0x7fff +; CHECK-NEXT: mov z3.s, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 +; CHECK-NEXT: mov z5.s, #32767 // =0x7fff ; CHECK-NEXT: movk w8, #18175, lsl #16 -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzs z3.s, p0/m, z1.s -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s +; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z2.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z3.s, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; CHECK-NEXT: mov z2.s, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z4.s +; CHECK-NEXT: fcvtzs z3.s, p1/m, z1.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z4.s +; CHECK-NEXT: fcvtzs z2.s, p2/m, z0.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z4.s, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p3, z5.s, z3.s -; CHECK-NEXT: sel z1.s, p4, z5.s, z4.s -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.s, p1, z5.s, z3.s +; CHECK-NEXT: sel z1.s, p3, z5.s, z2.s +; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 
; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f32.nxv8i16( %f) ret %x @@ -171,19 +138,16 @@ define @test_signed_v2f32_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.s +; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f32.nxv2i64( %f) @@ -193,41 +157,28 @@ define @test_signed_v2f32_v2i64( %f) { define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z5.s +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z5.s +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.s +; CHECK-NEXT: fcmuo p2.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p3, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret 
%x = call @llvm.fptosi.sat.nxv4f32.nxv4i64( %f) ret %x @@ -248,20 +199,17 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d -; CHECK-NEXT: mov z3.d, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0xffffffff80000000 +; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0x7fffffff ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.d +; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f64.nxv2i32( %f) @@ -271,41 +219,28 @@ define @test_signed_v2f64_v2i32( %f) { define @test_signed_v4f64_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z6.d, #0x7fffffff +; CHECK-NEXT: mov z3.d, #0xffffffff80000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 +; CHECK-NEXT: mov z4.d, #0xffffffff80000000 ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d +; CHECK-NEXT: mov z2.d, #0x7fffffff +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z5.d +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z5.d +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p3, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded 
Reload ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f64.nxv4i32( %f) ret %x @@ -316,7 +251,6 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -327,48 +261,38 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: mov z5.d, #0xffffffff80000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 -; CHECK-NEXT: mov z26.d, #0x7fffffff +; CHECK-NEXT: mov z6.d, #0xffffffff80000000 ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: movprfx z7, z0 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.d -; CHECK-NEXT: mov z6.d, x8 -; CHECK-NEXT: movprfx z25, z2 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d +; CHECK-NEXT: mov z7.d, #0xffffffff80000000 +; CHECK-NEXT: mov z24.d, #0xffffffff80000000 +; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: fcmuo p6.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z6.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z6.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel 
z5.d, p4, z5.d, z25.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d +; CHECK-NEXT: mov z4.d, #0x7fffffff +; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z25.d +; CHECK-NEXT: fcvtzs z5.d, p1/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z25.d +; CHECK-NEXT: fcvtzs z6.d, p2/m, z0.d +; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z25.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z25.d +; CHECK-NEXT: fcvtzs z24.d, p4/m, z2.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z3.s, z2.s @@ -382,40 +306,28 @@ define @test_signed_v8f64_v8i32( %f) { define @test_signed_v4f64_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f64_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z5.d, #32767 // =0x7fff +; CHECK-NEXT: mov z3.d, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 +; CHECK-NEXT: mov z5.d, #32767 // =0x7fff ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z1.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z3.d, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d +; CHECK-NEXT: mov z2.d, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z4.d +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcvtzs z2.d, p2/m, z0.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z4.d, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.d, p3, z5.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z5.d, z4.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z5.d, z3.d +; CHECK-NEXT: sel z1.d, p3, z5.d, z2.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr 
x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f64.nxv4i16( %f) ret %x @@ -426,7 +338,6 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -434,50 +345,41 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z25.d, #32767 // =0x7fff +; CHECK-NEXT: mov z5.d, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 +; CHECK-NEXT: mov z6.d, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: movprfx z6, z2 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z2.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: mov z7.d, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z25.d, #32767 // =0x7fff +; CHECK-NEXT: mov z24.d, x8 +; CHECK-NEXT: fcmuo p6.d, p0/z, z2.d, z2.d ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d -; CHECK-NEXT: movprfx z4, z3 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z3.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z5.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z5.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z5.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d -; 
CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z6.d, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmuo p2.d, p0/z, z3.d, z3.d -; CHECK-NEXT: mov z7.d, p3/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmuo p3.d, p0/z, z2.d, z2.d -; CHECK-NEXT: mov z24.d, p4/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d +; CHECK-NEXT: mov z4.d, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z24.d +; CHECK-NEXT: fcvtzs z5.d, p1/m, z3.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d +; CHECK-NEXT: fcvtzs z6.d, p2/m, z2.d +; CHECK-NEXT: fcvtzs z7.d, p3/m, z1.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z24.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z24.d +; CHECK-NEXT: fcvtzs z4.d, p4/m, z0.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d +; CHECK-NEXT: sel z2.d, p1, z25.d, z5.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p5, z25.d, z4.d -; CHECK-NEXT: sel z0.d, p6, z25.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z1.d, p7, z25.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z25.d, z24.d +; CHECK-NEXT: sel z0.d, p2, z25.d, z6.d +; CHECK-NEXT: sel z1.d, p3, z25.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z25.d, z4.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z2.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p6/m, #0 // =0x0 +; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s ; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s @@ -494,19 +396,16 @@ define @test_signed_v2f64_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, 
#-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.d +; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f64.nxv2i64( %f) @@ -516,39 +415,26 @@ define @test_signed_v2f64_v2i64( %f) { define @test_signed_v4f64_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z5.d +; CHECK-NEXT: fcvtzs z3.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d +; CHECK-NEXT: fcvtzs z4.d, p2/m, z1.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p3, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call 
@llvm.fptosi.sat.nxv4f64.nxv4i64( %f) ret %x @@ -570,19 +456,16 @@ define @test_signed_v2f16_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.d, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0xffffffff80000000 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.d, #0x7fffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h +; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f16.nxv2i32( %f) @@ -594,19 +477,16 @@ define @test_signed_v4f16_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.s, #0x7fffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.s, #0x80000000 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.s, #0x7fffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.s, p1/m, z2.s -; CHECK-NEXT: sel z0.s, p2, z3.s, z1.s +; CHECK-NEXT: fcvtzs z1.s, p1/m, z0.h +; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f16.nxv4i32( %f) @@ -616,41 +496,28 @@ 
define @test_signed_v4f16_v4i32( %f) { define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.s, #0x7fffffff +; CHECK-NEXT: mov z3.s, #0x80000000 +; CHECK-NEXT: mov z4.s, #0x80000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.s, #0x80000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.s, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s -; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h +; CHECK-NEXT: mov z2.s, #0x7fffffff +; CHECK-NEXT: fcmgt p3.h, p0/z, z0.h, z5.h +; CHECK-NEXT: fcvtzs z3.s, p1/m, z1.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: fcvtzs z4.s, p2/m, z0.h +; CHECK-NEXT: fcmuo p2.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.s, p1, z2.s, z3.s +; CHECK-NEXT: sel z1.s, p3, z2.s, z4.s +; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 
// =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f16.nxv8i32( %f) ret %x @@ -661,18 +528,16 @@ define @test_signed_v4f16_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #63488 // =0xf800 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z2.s, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.s, #32767 // =0x7fff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.s, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.s, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p2, z2.s, z1.s +; CHECK-NEXT: fcvtzs z2.s, p1/m, z0.h +; CHECK-NEXT: sel z0.s, p2, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f16.nxv4i16( %f) @@ -684,18 +549,16 @@ define @test_signed_v8f16_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #63488 // =0xf800 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z2.h, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff -; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.h, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.h, #32767 // =0x7fff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, #32767 // =0x7fff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.h, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.h, p2, z2.h, z1.h +; CHECK-NEXT: fcvtzs z2.h, p1/m, z0.h +; CHECK-NEXT: sel 
z0.h, p2, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f16.nxv8i16( %f) @@ -707,19 +570,16 @@ define @test_signed_v2f16_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h +; CHECK-NEXT: sel z0.d, p2, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f16.nxv2i64( %f) @@ -729,41 +589,28 @@ define @test_signed_v2f16_v2i64( %f) { define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p3.h, p0/z, z0.h, z5.h +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.h +; CHECK-NEXT: fcmuo p2.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p3, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call 
@llvm.fptosi.sat.nxv4f16.nxv4i64( %f) ret %x diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll index ed352ffec339f..b3aefb8460985 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll @@ -16,15 +16,13 @@ define @test_signed_v2f32_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff -; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcvtzu z1.d, p1/m, z0.s ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f32.nxv2i32( %f) ret %x @@ -35,13 +33,11 @@ define @test_signed_v4f32_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.s, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 +; CHECK-NEXT: fcvtzu z1.s, p1/m, z0.s ; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret @@ -53,21 +49,17 @@ define @test_signed_v8f32_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f32_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #1333788671 // =0x4f7fffff +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, 
#0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z4.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: fcvtzu z2.s, p1/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z4.s ; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z4.s -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcvtzu z3.s, p2/m, z1.s +; CHECK-NEXT: mov z2.s, p1/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z2.d ; CHECK-NEXT: mov z1.d, z3.d @@ -81,16 +73,14 @@ define @test_signed_v4f32_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: movk w8, #18303, lsl #16 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcvtzu z1.s, p1/m, z0.s ; CHECK-NEXT: mov z0.s, #65535 // =0xffff -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f32.nxv4i16( %f) ret %x @@ -101,24 +91,20 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk w8, #18303, lsl #16 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: movprfx z3, z1 -; 
CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzu z4.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z0.s, #65535 // =0xffff -; CHECK-NEXT: mov z3.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z4.s, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.s, p3, z0.s, z3.s -; CHECK-NEXT: sel z0.s, p0, z0.s, z4.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: fcvtzu z2.s, p1/m, z1.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z4.s +; CHECK-NEXT: mov z1.s, #65535 // =0xffff +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z4.s +; CHECK-NEXT: fcvtzu z3.s, p2/m, z0.s +; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f32.nxv8i16( %f) ret %x @@ -129,13 +115,11 @@ define @test_signed_v2f32_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1602224127 // =0x5f7fffff +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.s -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcvtzu z1.d, p1/m, z0.s ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret @@ -150,20 +134,16 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #1602224127 // =0x5f7fffff ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.s -; CHECK-NEXT: 
movprfx z1, z3 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z3.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z2.s, z4.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: fcvtzu z0.d, p1/m, z2.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z2.s, z4.s ; CHECK-NEXT: fcmgt p0.s, p0/z, z3.s, z4.s -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcvtzu z1.d, p2/m, z3.s +; CHECK-NEXT: mov z0.d, p1/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f32.nxv4i64( %f) @@ -185,16 +165,14 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16879, lsl #48 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d +; CHECK-NEXT: fcvtzu z1.d, p1/m, z0.d ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f64.nxv2i32( %f) ret %x @@ -205,24 +183,20 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16879, lsl #48 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzu z4.d, 
p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z0.d, z3.d -; CHECK-NEXT: sel z0.d, p0, z0.d, z4.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: fcvtzu z2.d, p1/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: mov z1.d, #0xffffffff +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d +; CHECK-NEXT: fcvtzu z3.d, p2/m, z0.d +; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f64.nxv4i32( %f) ret %x @@ -233,47 +207,35 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: movi v6.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16879, lsl #48 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0 +; CHECK-NEXT: movi v7.2d, #0000000000000000 ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0 -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: movprfx z6, z0 -; CHECK-NEXT: fcvtzu z6.d, p0/m, z0.d -; 
CHECK-NEXT: movprfx z7, z3 -; CHECK-NEXT: fcvtzu z7.d, p0/m, z3.d -; CHECK-NEXT: movprfx z24, z2 -; CHECK-NEXT: fcvtzu z24.d, p0/m, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z4.d -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z24.d, x8 +; CHECK-NEXT: fcvtzu z4.d, p1/m, z1.d +; CHECK-NEXT: fcvtzu z5.d, p2/m, z0.d +; CHECK-NEXT: fcvtzu z6.d, p3/m, z3.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z24.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z24.d ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z4.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z24.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p5, z0.d, z5.d -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p6, z0.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z0.d, z7.d +; CHECK-NEXT: fcvtzu z7.d, p4/m, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z3.d, z24.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z4.d, p0, z0.d, z24.d +; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z24.d +; CHECK-NEXT: sel z1.d, p1, z0.d, z4.d +; CHECK-NEXT: sel z2.d, p2, z0.d, z5.d +; CHECK-NEXT: sel z3.d, p3, z0.d, z6.d +; CHECK-NEXT: sel z4.d, p0, z0.d, z7.d ; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s ; CHECK-NEXT: uzp1 z1.s, z4.s, z3.s ; CHECK-NEXT: addvl sp, sp, #1 @@ -288,24 +250,20 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge 
p2.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzu z4.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z0.d, #65535 // =0xffff -; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z4.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z0.d, z3.d -; CHECK-NEXT: sel z0.d, p0, z0.d, z4.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: fcvtzu z2.d, p1/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d +; CHECK-NEXT: fcvtzu z3.d, p2/m, z0.d +; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f64.nxv4i16( %f) ret %x @@ -316,47 +274,35 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: movi v6.2d, #0000000000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0 +; CHECK-NEXT: movi v7.2d, #0000000000000000 ; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: movprfx z5, z3 -; CHECK-NEXT: fcvtzu z5.d, p0/m, z3.d -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: movprfx z6, z2 -; CHECK-NEXT: fcvtzu z6.d, p0/m, z2.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.d -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzu z24.d, p0/m, z0.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z24.d, x8 +; CHECK-NEXT: fcvtzu z4.d, p1/m, z3.d +; CHECK-NEXT: fcvtzu z5.d, p2/m, z2.d +; CHECK-NEXT: fcvtzu z6.d, p3/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z24.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z2.d, z24.d ; CHECK-NEXT: mov z2.d, #65535 // =0xffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d -; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z24.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p5, z2.d, z5.d -; 
CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z1.d, p6, z2.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z2.d, z7.d +; CHECK-NEXT: fcvtzu z7.d, p4/m, z0.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z24.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p0, z2.d, z24.d +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z24.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z4.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z3.d, p3, z2.d, z6.d +; CHECK-NEXT: sel z2.d, p0, z2.d, z7.d ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h @@ -372,13 +318,11 @@ define @test_signed_v2f64_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #4895412794951729151 // =0x43efffffffffffff +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.d -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcvtzu z1.d, p1/m, z0.d ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret @@ -390,21 +334,17 @@ define @test_signed_v4f64_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f64_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: mov x8, #4895412794951729151 // =0x43efffffffffffff +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z4.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b +; 
CHECK-NEXT: fcvtzu z2.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z4.d ; CHECK-NEXT: fcmgt p0.d, p0/z, z1.d, z4.d -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcvtzu z3.d, p2/m, z1.d +; CHECK-NEXT: mov z2.d, p1/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z2.d ; CHECK-NEXT: mov z1.d, z3.d @@ -429,15 +369,13 @@ define @test_signed_v2f16_v2i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcvtzu z1.d, p1/m, z0.h ; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f16.nxv2i32( %f) ret %x @@ -448,13 +386,11 @@ define @test_signed_v4f16_v4i32( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.s, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 +; CHECK-NEXT: fcvtzu z1.s, p1/m, z0.h ; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret @@ -469,20 +405,16 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: 
movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z2.h -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fcvtzu z1.s, p0/m, z3.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z2.h, z4.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: fcvtzu z0.s, p1/m, z2.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z4.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.s, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z0.s, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcvtzu z1.s, p2/m, z3.h +; CHECK-NEXT: mov z0.s, p1/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f16.nxv8i32( %f) @@ -494,15 +426,13 @@ define @test_signed_v4f16_v4i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h +; CHECK-NEXT: fcvtzu z1.s, p1/m, z0.h ; CHECK-NEXT: mov z0.s, #65535 // =0xffff -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f16.nxv4i16( %f) ret %x @@ -513,13 +443,11 @@ define @test_signed_v8f16_v8i16( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu 
z1.h, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z1.h, p1/m, #0 // =0x0 +; CHECK-NEXT: fcvtzu z1.h, p1/m, z0.h ; CHECK-NEXT: mov z1.h, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret @@ -532,13 +460,11 @@ define @test_signed_v2f16_v2i64( %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z0.h -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z1.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcvtzu z1.d, p1/m, z0.h ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret @@ -553,20 +479,16 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z2.h -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: fcvtzu z1.d, p0/m, z3.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z2.h, z4.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: fcvtzu z0.d, p1/m, z2.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z4.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z4.h -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p3/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcvtzu z1.d, p2/m, z3.h +; CHECK-NEXT: mov z0.d, p1/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/m, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f16.nxv4i64( %f) diff --git 
a/llvm/test/CodeGen/AArch64/sve-llrint.ll b/llvm/test/CodeGen/AArch64/sve-llrint.ll index 16e0e0c4661b6..b0198cf9d1247 100644 --- a/llvm/test/CodeGen/AArch64/sve-llrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-llrint.ll @@ -6,20 +6,17 @@ define @llrint_v1i64_v1f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv1i64.nxv1f16( %x) @@ -32,20 +29,17 @@ define @llrint_v1i64_v2f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: 
fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv2i64.nxv2f16( %x) @@ -56,43 +50,30 @@ declare @llvm.llrint.nxv2i64.nxv2f16() define @llrint_v4i64_v4f16( %x) { ; CHECK-LABEL: llrint_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z5.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; 
CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv4i64.nxv4f16( %x) ret %a @@ -104,7 +85,6 @@ define @llrint_v8i64_v8f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -116,8 +96,10 @@ define @llrint_v8i64_v8f16( %x) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z6.h, w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z6.d, #0x8000000000000000 +; CHECK-NEXT: mov z25.h, w8 +; CHECK-NEXT: mov z7.d, #0x8000000000000000 +; CHECK-NEXT: mov z24.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -132,41 +114,29 @@ define @llrint_v8i64_v8f16( %x) { ; CHECK-NEXT: fcmge p2.h, p0/z, z1.h, z4.h ; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z4.h ; CHECK-NEXT: fcmge p4.h, p0/z, z5.h, z4.h -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.h -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.h -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.h -; CHECK-NEXT: movprfx z25, z5 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z5.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z3.h, z6.h -; CHECK-NEXT: fcmgt p5.h, p0/z, z2.h, z6.h -; 
CHECK-NEXT: fcmgt p6.h, p0/z, z1.h, z6.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z0.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z5.h, z6.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z0.d, z7.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: sel z7.d, p3, z0.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h -; CHECK-NEXT: sel z24.d, p4, z0.d, z25.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z3.h, z3.h +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p5.h, p0/z, z5.h, z25.h +; CHECK-NEXT: fcmuo p6.h, p0/z, z1.h, z1.h +; CHECK-NEXT: fcvtzs z0.d, p1/m, z2.h +; CHECK-NEXT: fcvtzs z6.d, p2/m, z1.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z25.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z25.h +; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z3.h, z25.h +; CHECK-NEXT: fcvtzs z24.d, p4/m, z5.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z2.h, z2.h +; CHECK-NEXT: mov z0.d, p1/m, z4.d +; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z5.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z24.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, 
#0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -180,7 +150,7 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-LABEL: llrint_v16i64_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -188,124 +158,110 @@ define @llrint_v16i64_v16f16( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: uunpklo z4.s, z1.h -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: 
uunpkhi z1.s, z1.h -; CHECK-NEXT: mov z5.h, w8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z24.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z25.d, #0x8000000000000000 -; CHECK-NEXT: mov z27.h, w8 -; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff -; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: mov z7.d, #0x8000000000000000 +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: uunpklo z4.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpklo z6.d, z0.s +; CHECK-NEXT: uunpklo z5.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: uunpklo z24.d, z4.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z26.d, z1.s -; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z6.d, z3.s +; CHECK-NEXT: uunpklo z25.d, z1.s +; CHECK-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: mov z10.d, #0x7fffffffffffffff +; CHECK-NEXT: frintx z4.h, p0/m, z4.h ; CHECK-NEXT: frintx z2.h, p0/m, z2.h -; CHECK-NEXT: frintx z3.h, p0/m, z3.h +; CHECK-NEXT: frintx z5.h, p0/m, z5.h +; CHECK-NEXT: movprfx z26, z0 +; CHECK-NEXT: frintx z26.h, p0/m, z0.h +; CHECK-NEXT: uunpkhi z0.d, z1.s ; CHECK-NEXT: frintx z6.h, p0/m, z6.h -; CHECK-NEXT: movprfx z28, z0 -; CHECK-NEXT: frintx z28.h, p0/m, z0.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: frintx z29.h, p0/m, z4.h -; CHECK-NEXT: frintx z24.h, p0/m, z24.h -; CHECK-NEXT: movprfx z30, z1 -; CHECK-NEXT: frintx z30.h, p0/m, z1.h -; CHECK-NEXT: frintx z26.h, p0/m, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z2.h, z5.h -; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z5.h -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.h -; CHECK-NEXT: movprfx z0, z3 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z3.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z5.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z3.h, z27.h -; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: fcmge 
p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: movprfx z3, z6 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z6.h -; CHECK-NEXT: fcmge p8.h, p0/z, z24.h, z5.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z2.h, z27.h -; CHECK-NEXT: fcmge p9.h, p0/z, z26.h, z5.h -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z4, z24 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z24.h -; CHECK-NEXT: fcmge p10.h, p0/z, z30.h, z5.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z26 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z26.h -; CHECK-NEXT: movprfx z8, z30 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.h -; CHECK-NEXT: mov z1.d, p5/m, z25.d -; CHECK-NEXT: fcmge p5.h, p0/z, z29.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p2/m, z25.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: movprfx z2, z28 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z28.h -; CHECK-NEXT: movprfx z5, z29 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z29.h -; CHECK-NEXT: not p7.b, p0/z, p7.b -; CHECK-NEXT: mov z3.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.h, p0/z, z6.h, z27.h -; CHECK-NEXT: mov z1.d, p4/m, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z29.h, z27.h -; CHECK-NEXT: sel z9.d, p7, z25.d, z2.d -; CHECK-NEXT: not p7.b, p0/z, p9.b -; CHECK-NEXT: mov z4.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.h, p0/z, z28.h, z27.h -; CHECK-NEXT: mov z5.d, p5/m, z25.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z24.h, z27.h -; CHECK-NEXT: fcmuo p9.h, p0/z, z6.h, z6.h -; CHECK-NEXT: sel z6.d, p7, z25.d, z31.d -; CHECK-NEXT: sel z25.d, p6, z25.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z26.h, z27.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z30.h, z27.h -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h -; CHECK-NEXT: sel z2.d, p8, z7.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z7.d, z9.d -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmuo p8.h, 
p0/z, z29.h, z29.h -; CHECK-NEXT: mov z4.d, p5/m, z7.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z24.h, z24.h -; CHECK-NEXT: fcmuo p10.h, p0/z, z26.h, z26.h -; CHECK-NEXT: mov z5.d, p3/m, z7.d -; CHECK-NEXT: mov z6.d, p6/m, z7.d +; CHECK-NEXT: movprfx z29, z3 +; CHECK-NEXT: frintx z29.h, p0/m, z3.h +; CHECK-NEXT: frintx z25.h, p0/m, z25.h +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.h, p0/z, z4.h, z24.h +; CHECK-NEXT: fcmge p2.h, p0/z, z2.h, z24.h +; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z24.h +; CHECK-NEXT: movprfx z9, z0 +; CHECK-NEXT: frintx z9.h, p0/m, z0.h +; CHECK-NEXT: fcmge p4.h, p0/z, z26.h, z24.h +; CHECK-NEXT: fcmge p5.h, p0/z, z6.h, z24.h +; CHECK-NEXT: fcmge p7.h, p0/z, z25.h, z24.h +; CHECK-NEXT: fcmge p6.h, p0/z, z29.h, z24.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z6.h, z1.h +; CHECK-NEXT: fcmgt p10.h, p0/z, z25.h, z1.h +; CHECK-NEXT: fcmuo p9.h, p0/z, z5.h, z5.h +; CHECK-NEXT: fcvtzs z7.d, p1/m, z4.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z1.h +; CHECK-NEXT: fcvtzs z27.d, p2/m, z2.h +; CHECK-NEXT: fcmge p2.h, p0/z, z9.h, z24.h +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z28.d, p3/m, z5.h +; CHECK-NEXT: fcvtzs z3.d, p4/m, z26.h +; CHECK-NEXT: fcvtzs z30.d, p5/m, z6.h +; CHECK-NEXT: fcvtzs z8.d, p7/m, z25.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z5.h, z1.h +; CHECK-NEXT: fcmgt p7.h, p0/z, z26.h, z1.h +; CHECK-NEXT: fcvtzs z31.d, p6/m, z29.h +; CHECK-NEXT: sel z0.d, p1, z10.d, z7.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z29.h, z1.h +; CHECK-NEXT: fcvtzs z24.d, p2/m, z9.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z9.h, z1.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z4.h, z4.h +; CHECK-NEXT: fcmuo p6.h, p0/z, z2.h, z2.h +; CHECK-NEXT: sel z4.d, p8, z10.d, z30.d +; CHECK-NEXT: fcmuo p8.h, p0/z, z25.h, z25.h +; CHECK-NEXT: sel z1.d, p4, z10.d, z27.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z26.h, z26.h +; CHECK-NEXT: sel z2.d, p5, z10.d, z28.d +; CHECK-NEXT: mov z3.d, p7/m, z10.d +; 
CHECK-NEXT: fcmuo p5.h, p0/z, z6.h, z6.h +; CHECK-NEXT: fcmuo p7.h, p0/z, z29.h, z29.h +; CHECK-NEXT: sel z5.d, p1, z10.d, z31.d +; CHECK-NEXT: sel z6.d, p10, z10.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: fcmuo p0.h, p0/z, z9.h, z9.h +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z7.d, p2, z10.d, z24.d +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h -; CHECK-NEXT: sel z7.d, p7, z7.d, z25.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p7/m, #0 // =0x0 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z6.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv16i64.nxv16f16( %x) @@ -318,6 +274,8 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK: // 
%bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p11, [sp] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill @@ -340,8 +298,8 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -352,230 +310,191 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w9, #64511 // =0xfbff +; CHECK-NEXT: uunpklo z25.s, z1.h +; CHECK-NEXT: uunpkhi z10.s, z1.h +; CHECK-NEXT: uunpklo z9.s, z2.h ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: mov z26.h, w9 -; CHECK-NEXT: uunpkhi z25.s, z1.h +; CHECK-NEXT: 
uunpkhi z12.s, z3.h +; CHECK-NEXT: mov z27.h, w9 ; CHECK-NEXT: mov w9, #31743 // =0x7bff -; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z31.s, z2.h -; CHECK-NEXT: uunpkhi z12.s, z2.h -; CHECK-NEXT: mov z17.d, z3.d -; CHECK-NEXT: uunpklo z0.d, z4.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z7.d, z5.s -; CHECK-NEXT: uunpkhi z24.d, z5.s -; CHECK-NEXT: uunpklo z28.d, z6.s -; CHECK-NEXT: uunpkhi z29.d, z6.s -; CHECK-NEXT: uunpklo z8.d, z25.s -; CHECK-NEXT: uunpkhi z9.d, z25.s -; CHECK-NEXT: uunpklo z16.s, z17.h -; CHECK-NEXT: uunpklo z11.d, z31.s -; CHECK-NEXT: uunpkhi z14.d, z31.s -; CHECK-NEXT: uunpkhi z17.s, z17.h -; CHECK-NEXT: movprfx z30, z4 -; CHECK-NEXT: frintx z30.h, p0/m, z4.h -; CHECK-NEXT: movprfx z4, z7 -; CHECK-NEXT: frintx z4.h, p0/m, z7.h -; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: frintx z6.h, p0/m, z24.h -; CHECK-NEXT: movprfx z7, z28 -; CHECK-NEXT: frintx z7.h, p0/m, z28.h -; CHECK-NEXT: movprfx z25, z29 -; CHECK-NEXT: frintx z25.h, p0/m, z29.h -; CHECK-NEXT: movprfx z3, z9 -; CHECK-NEXT: frintx z3.h, p0/m, z9.h -; CHECK-NEXT: mov z5.h, w9 -; CHECK-NEXT: movprfx z31, z11 -; CHECK-NEXT: frintx z31.h, p0/m, z11.h -; CHECK-NEXT: movprfx z9, z14 -; CHECK-NEXT: frintx z9.h, p0/m, z14.h -; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z26.h -; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z26.h -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.h -; CHECK-NEXT: fcmge p2.h, p0/z, z30.h, z26.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z4.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z26.h -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.h -; CHECK-NEXT: movprfx z10, z6 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z6.h -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p3.h, p0/z, z7.h, z26.h -; CHECK-NEXT: movprfx z13, z7 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z7.h -; CHECK-NEXT: movprfx z15, z25 -; CHECK-NEXT: fcvtzs z15.d, 
p0/m, z25.h -; CHECK-NEXT: not p5.b, p0/z, p1.b -; CHECK-NEXT: movprfx z18, z3 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z3.h -; CHECK-NEXT: movprfx z20, z31 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z31.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z21, z9 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z9.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z30.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z27.d, z24.d -; CHECK-NEXT: not p7.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z5.h -; CHECK-NEXT: mov z29.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z26.h -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmge p6.h, p0/z, z9.h, z26.h -; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z5.h -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p7, z27.d, z28.d -; CHECK-NEXT: movprfx z28, z8 -; CHECK-NEXT: frintx z28.h, p0/m, z8.h -; CHECK-NEXT: sel z8.d, p5, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z12.s -; CHECK-NEXT: uunpkhi z12.d, z12.s -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: sel z11.d, p3, z27.d, z13.d -; CHECK-NEXT: uunpklo z13.d, z16.s -; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z26.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: sel z24.d, p5, z27.d, z15.d -; CHECK-NEXT: uunpkhi z15.d, z16.s -; CHECK-NEXT: movprfx z14, z28 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z28.h -; CHECK-NEXT: frintx z10.h, p0/m, z10.h -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: frintx z12.h, p0/m, z12.h -; CHECK-NEXT: uunpkhi z17.d, z17.s -; CHECK-NEXT: movprfx z19, z13 -; CHECK-NEXT: frintx z19.h, p0/m, z13.h -; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z31.h, z26.h -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: frintx z15.h, p0/m, z15.h -; CHECK-NEXT: fcmge p7.h, p0/z, z10.h, z26.h +; CHECK-NEXT: uunpkhi z14.s, z2.h +; CHECK-NEXT: uunpklo z15.s, z3.h +; CHECK-NEXT: uunpklo z7.d, z0.s +; CHECK-NEXT: uunpklo z5.d, z4.s +; CHECK-NEXT: 
uunpkhi z6.d, z4.s +; CHECK-NEXT: uunpklo z29.d, z25.s +; CHECK-NEXT: uunpkhi z26.d, z0.s +; CHECK-NEXT: uunpklo z8.d, z10.s +; CHECK-NEXT: uunpkhi z11.d, z10.s +; CHECK-NEXT: uunpklo z10.d, z9.s +; CHECK-NEXT: uunpkhi z13.d, z9.s +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: uunpklo z16.d, z12.s +; CHECK-NEXT: uunpklo z18.d, z14.s +; CHECK-NEXT: movprfx z1, z7 +; CHECK-NEXT: frintx z1.h, p0/m, z7.h +; CHECK-NEXT: movprfx z4, z5 +; CHECK-NEXT: frintx z4.h, p0/m, z5.h +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: frintx z5.h, p0/m, z6.h +; CHECK-NEXT: movprfx z7, z29 +; CHECK-NEXT: frintx z7.h, p0/m, z29.h +; CHECK-NEXT: movprfx z6, z26 +; CHECK-NEXT: frintx z6.h, p0/m, z26.h +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z9, z11 +; CHECK-NEXT: frintx z9.h, p0/m, z11.h +; CHECK-NEXT: movprfx z3, z10 +; CHECK-NEXT: frintx z3.h, p0/m, z10.h +; CHECK-NEXT: movprfx z10, z13 +; CHECK-NEXT: frintx z10.h, p0/m, z13.h +; CHECK-NEXT: uunpkhi z26.d, z25.s +; CHECK-NEXT: uunpkhi z13.d, z12.s +; CHECK-NEXT: frintx z8.h, p0/m, z8.h +; CHECK-NEXT: fcmge p3.h, p0/z, z1.h, z27.h +; CHECK-NEXT: uunpkhi z14.d, z14.s +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p2.h, p0/z, z7.h, z27.h +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z27.h +; CHECK-NEXT: fcmge p5.h, p0/z, z5.h, z27.h +; CHECK-NEXT: uunpklo z19.d, z15.s +; CHECK-NEXT: uunpkhi z15.d, z15.s +; CHECK-NEXT: movprfx z20, z13 +; CHECK-NEXT: frintx z20.h, p0/m, z13.h +; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.h, p0/z, z6.h, z27.h +; CHECK-NEXT: frintx z26.h, p0/m, z26.h +; CHECK-NEXT: fcvtzs z29.d, p3/m, z1.h +; CHECK-NEXT: fcmge p3.h, p0/z, z9.h, z27.h +; CHECK-NEXT: mov z11.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z31.d, p2/m, z7.h +; CHECK-NEXT: fcmge p2.h, p0/z, z8.h, z27.h +; CHECK-NEXT: mov z17.d, #0x8000000000000000 ; CHECK-NEXT: frintx z16.h, p0/m, z16.h -; CHECK-NEXT: fcmge p8.h, p0/z, z12.h, 
z26.h -; CHECK-NEXT: frintx z17.h, p0/m, z17.h -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z13.d, p3, z27.d, z18.d -; CHECK-NEXT: fcmge p3.h, p0/z, z19.h, z26.h -; CHECK-NEXT: movprfx z0, z15 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z15.h -; CHECK-NEXT: sel z22.d, p4, z27.d, z14.d -; CHECK-NEXT: sel z18.d, p6, z27.d, z21.d -; CHECK-NEXT: movprfx z21, z12 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.h -; CHECK-NEXT: sel z14.d, p5, z27.d, z20.d -; CHECK-NEXT: fcmge p4.h, p0/z, z15.h, z26.h -; CHECK-NEXT: movprfx z20, z10 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.h -; CHECK-NEXT: movprfx z2, z17 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z17.h -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z26.h -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.h, p0/z, z17.h, z26.h -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: mov z21.d, p7/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z16.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z16.h, z16.h -; CHECK-NEXT: mov z29.d, p2/m, z26.d -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: ldr z27, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z7.h, z5.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z12.h, z5.h -; CHECK-NEXT: fcmuo p8.h, p0/z, z17.h, z17.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: mov z1.d, p4/m, z26.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z15.h, z15.h -; CHECK-NEXT: mov z8.d, p9/m, z26.d -; CHECK-NEXT: mov z27.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z15.h, 
z5.h -; CHECK-NEXT: mov z2.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z5.h -; CHECK-NEXT: mov z11.d, p6/m, z26.d -; CHECK-NEXT: fcmuo p6.h, p0/z, z19.h, z19.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z5.h -; CHECK-NEXT: sel z15.d, p2, z26.d, z21.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z12.h, z12.h -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p7, z26.d, z22.d -; CHECK-NEXT: mov z0.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z10.h, z5.h -; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z17.d, p3, z26.d, z23.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z10.h, z10.h -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z26.d, z18.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z9.h, z9.h -; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z5.h -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z26.d, z20.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z31.h, z5.h -; CHECK-NEXT: mov z17.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.h, p0/z, z31.h, z31.h +; CHECK-NEXT: frintx z14.h, p0/m, z14.h +; CHECK-NEXT: fcvtzs z0.d, p4/m, z4.h +; CHECK-NEXT: fcvtzs z28.d, p5/m, z5.h +; CHECK-NEXT: fcmge p4.h, p0/z, z3.h, z27.h +; CHECK-NEXT: mov z12.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.h, p0/z, z10.h, z27.h +; CHECK-NEXT: mov z13.d, #0x8000000000000000 +; CHECK-NEXT: frintx z19.h, p0/m, z19.h +; CHECK-NEXT: frintx z15.h, p0/m, z15.h +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z30.d, p1/m, z6.h +; CHECK-NEXT: fcmge p1.h, p0/z, z26.h, z27.h +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: frintx z18.h, p0/m, z18.h +; CHECK-NEXT: fcvtzs z11.d, p3/m, z9.h +; CHECK-NEXT: fcmge p3.h, p0/z, z20.h, z27.h +; CHECK-NEXT: mov z25.h, w9 +; CHECK-NEXT: fcvtzs z17.d, p2/m, z8.h +; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z27.h +; CHECK-NEXT: mov z21.d, #0x8000000000000000 +; CHECK-NEXT: fcmge 
p2.h, p0/z, z14.h, z27.h +; CHECK-NEXT: mov z22.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z12.d, p4/m, z3.h +; CHECK-NEXT: fcvtzs z13.d, p5/m, z10.h +; CHECK-NEXT: fcmge p4.h, p0/z, z19.h, z27.h +; CHECK-NEXT: mov z23.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.h, p0/z, z15.h, z27.h +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z2.d, p1/m, z26.h +; CHECK-NEXT: fcmge p1.h, p0/z, z18.h, z27.h +; CHECK-NEXT: fcvtzs z24.d, p3/m, z20.h +; CHECK-NEXT: mov z27.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p11.h, p0/z, z20.h, z25.h +; CHECK-NEXT: fcvtzs z21.d, p6/m, z16.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z16.h, z25.h +; CHECK-NEXT: fcmuo p6.h, p0/z, z16.h, z16.h +; CHECK-NEXT: fcvtzs z22.d, p2/m, z14.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z8.h, z25.h +; CHECK-NEXT: mov z16.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p7.h, p0/z, z5.h, z25.h +; CHECK-NEXT: fcvtzs z23.d, p4/m, z19.h +; CHECK-NEXT: fcvtzs z0.d, p5/m, z15.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z20.h, z20.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z15.h, z25.h +; CHECK-NEXT: mov z24.d, p11/m, z27.d +; CHECK-NEXT: sel z20.d, p3, z27.d, z21.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z25.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z1.h, z25.h +; CHECK-NEXT: mov z17.d, p2/m, z27.d +; CHECK-NEXT: fcvtzs z16.d, p1/m, z18.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z18.h, z25.h +; CHECK-NEXT: mov z28.d, p7/m, z27.d +; CHECK-NEXT: fcmgt p7.h, p0/z, z14.h, z25.h +; CHECK-NEXT: fcmuo p1.h, p0/z, z15.h, z15.h +; CHECK-NEXT: mov z0.d, p5/m, z27.d +; CHECK-NEXT: mov z24.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p5.h, p0/z, z10.h, z25.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z19.h, z19.h +; CHECK-NEXT: sel z19.d, p3, z27.d, z23.d +; CHECK-NEXT: fcmuo p3.h, p0/z, z14.h, z14.h +; CHECK-NEXT: mov z20.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z25.h +; CHECK-NEXT: fcmgt p10.h, p0/z, z7.h, z25.h +; CHECK-NEXT: str z24, [x8, #15, mul vl] +; CHECK-NEXT: sel z24.d, p2, z27.d, z16.d +; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z25.h +; 
CHECK-NEXT: sel z15.d, p7, z27.d, z22.d +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z13.d, p5/m, z27.d +; CHECK-NEXT: str z20, [x8, #14, mul vl] +; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z25.h +; CHECK-NEXT: fcmuo p1.h, p0/z, z18.h, z18.h +; CHECK-NEXT: mov z19.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z10.h, z10.h +; CHECK-NEXT: mov z29.d, p8/m, z27.d ; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h -; CHECK-NEXT: str z17, [x8, #12, mul vl] -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z5.h +; CHECK-NEXT: mov z15.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p3.h, p0/z, z3.h, z3.h +; CHECK-NEXT: sel z0.d, p2, z27.d, z12.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z9.h, z9.h +; CHECK-NEXT: mov z30.d, p9/m, z27.d +; CHECK-NEXT: str z19, [x8, #12, mul vl] +; CHECK-NEXT: sel z3.d, p5, z27.d, z11.d +; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 ; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: sel z0.d, p1, z26.d, z14.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: sel z3.d, p4, z26.d, z13.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h -; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: sel z1.d, p3, z26.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z7.h, z7.h -; CHECK-NEXT: ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z2, [x8, #9, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z16.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z30.h, z30.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z26.h, z25.h +; CHECK-NEXT: mov z13.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z8.h, z8.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z26.h, z26.h +; 
CHECK-NEXT: str z24, [x8, #10, mul vl] +; CHECK-NEXT: mov z3.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.h, p0/z, z4.h, z25.h +; CHECK-NEXT: str z13, [x8, #9, mul vl] +; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h +; CHECK-NEXT: mov z31.d, p10/m, z27.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p0.h, p0/z, z7.h, z7.h -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p1/m, z27.d +; CHECK-NEXT: fcmuo p1.h, p0/z, z7.h, z7.h ; CHECK-NEXT: str z3, [x8, #7, mul vl] -; CHECK-NEXT: ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z16, [x8, #6, mul vl] -; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z26.d -; CHECK-NEXT: str z29, [x8, #2, mul vl] -; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: mov z17.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z1.h, z1.h +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z5.h, z5.h +; CHECK-NEXT: fcmuo p0.h, p0/z, z4.h, z4.h +; CHECK-NEXT: str z17, [x8, #6, mul vl] +; CHECK-NEXT: mov z31.d, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z30.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, z27.d +; CHECK-NEXT: mov z29.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z2, [x8, #5, mul vl] +; CHECK-NEXT: str z31, [x8, #4, mul vl] +; CHECK-NEXT: mov z28.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z30, [x8, #3, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z29, [x8, #2, mul vl] +; CHECK-NEXT: str z28, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 
16-byte Folded Reload @@ -592,6 +511,8 @@ define @llrint_v32i64_v32f16( %x) { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload @@ -611,20 +532,17 @@ define @llrint_v1i64_v1f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv1i64.nxv1f32( %x) @@ -637,20 +555,17 @@ define @llrint_v2i64_v2f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: 
fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv2i64.nxv2f32( %x) @@ -661,43 +576,30 @@ declare @llvm.llrint.nxv2i64.nxv2f32() define @llrint_v4i64_v4f32( %x) { ; CHECK-LABEL: llrint_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, 
p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmuo p3.s, p0/z, z1.s, z1.s +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z5.s +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z5.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv4i64.nxv4f32( %x) ret %a @@ -709,7 +611,6 @@ define @llrint_v8i64_v8f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -718,57 +619,47 @@ define @llrint_v8i64_v8f32( %x) { ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: uunpklo z3.d, z1.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z3.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: mov z5.d, #0x8000000000000000 -; CHECK-NEXT: mov z6.s, w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z6.d, #0x8000000000000000 +; CHECK-NEXT: mov z25.s, w8 +; CHECK-NEXT: mov z7.d, #0x8000000000000000 ; CHECK-NEXT: frintx z2.s, p0/m, z2.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: mov z24.d, #0x8000000000000000 ; CHECK-NEXT: frintx z3.s, p0/m, z3.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, z4.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z4.s -; CHECK-NEXT: movprfx z7, z0 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmge p3.s, p0/z, z3.s, z4.s ; CHECK-NEXT: fcmge p4.s, p0/z, z1.s, z4.s -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.s -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.s -; CHECK-NEXT: movprfx z25, z1 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z1.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z3.s, z6.s -; CHECK-NEXT: fcmgt p5.s, p0/z, z2.s, z6.s -; CHECK-NEXT: fcmgt p6.s, p0/z, z0.s, z6.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z6.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, 
z7.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z2.s, z2.s -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d -; CHECK-NEXT: fcmuo p4.s, p0/z, z3.s, z3.s +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p5.s, p0/z, z1.s, z25.s +; CHECK-NEXT: fcvtzs z5.d, p1/m, z2.s +; CHECK-NEXT: fcvtzs z6.d, p2/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z2.s, z25.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z25.s +; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z3.s, z25.s +; CHECK-NEXT: fcvtzs z24.d, p4/m, z1.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z2.s, z2.s +; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d +; CHECK-NEXT: fcmuo p1.s, p0/z, z3.s, z3.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -782,7 +673,7 @@ define @llrint_v16i64_v16f32( %x) { ; CHECK-LABEL: llrint_v16i64_v16f32: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -790,119 +681,106 @@ define @llrint_v16i64_v16f32( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ; CHECK-NEXT: uunpklo z4.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: uunpklo z24.d, z2.s +; CHECK-NEXT: uunpklo z6.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpklo z25.d, z3.s -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: frintx z5.s, p0/m, z4.s -; CHECK-NEXT: movprfx z6, z0 -; 
CHECK-NEXT: frintx z6.s, p0/m, z0.s -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: frintx z7.s, p0/m, z7.s -; CHECK-NEXT: movprfx z28, z1 -; CHECK-NEXT: frintx z28.s, p0/m, z1.s +; CHECK-NEXT: uunpklo z5.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z7.d, z3.s +; CHECK-NEXT: mov z24.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z0.d, #0x8000000000000000 -; CHECK-NEXT: frintx z24.s, p0/m, z24.s -; CHECK-NEXT: movprfx z29, z2 -; CHECK-NEXT: frintx z29.s, p0/m, z2.s -; CHECK-NEXT: frintx z25.s, p0/m, z25.s -; CHECK-NEXT: movprfx z30, z3 -; CHECK-NEXT: frintx z30.s, p0/m, z3.s -; CHECK-NEXT: mov z27.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z5.s, z4.s -; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z4.s -; CHECK-NEXT: movprfx z1, z5 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z5.s -; CHECK-NEXT: movprfx z2, z6 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z6.s -; CHECK-NEXT: fcmge p5.s, p0/z, z7.s, z4.s -; CHECK-NEXT: fcmge p6.s, p0/z, z28.s, z4.s -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.s -; CHECK-NEXT: fcmge p8.s, p0/z, z29.s, z4.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z5.s, z27.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z6.s, z27.s -; CHECK-NEXT: fcmge p9.s, p0/z, z25.s, z4.s -; CHECK-NEXT: movprfx z31, z25 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z25.s -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z28.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmge p10.s, p0/z, z30.s, z4.s -; CHECK-NEXT: movprfx z8, z30 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.s -; CHECK-NEXT: mov z1.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.s, p0/z, z24.s, z4.s -; CHECK-NEXT: movprfx z4, z29 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z29.s -; CHECK-NEXT: mov z2.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z6.s, z6.s -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z24.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p4.b, p0/z, 
p4.b -; CHECK-NEXT: mov z3.d, p5/m, z0.d -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: mov z5.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p8.s, p0/z, z7.s, z27.s -; CHECK-NEXT: not p6.b, p0/z, p9.b -; CHECK-NEXT: mov z6.d, p4/m, z0.d -; CHECK-NEXT: fcmuo p9.s, p0/z, z7.s, z7.s -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.s, p0/z, z28.s, z27.s -; CHECK-NEXT: sel z7.d, p5, z0.d, z4.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z24.s, z27.s -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z30.s, z27.s -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z26.d, z1.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z29.s, z27.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z27.s -; CHECK-NEXT: sel z1.d, p7, z26.d, z2.d -; CHECK-NEXT: fcmuo p7.s, p0/z, z28.s, z28.s -; CHECK-NEXT: sel z2.d, p8, z26.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z26.d, z5.d -; CHECK-NEXT: fcmuo p8.s, p0/z, z29.s, z29.s -; CHECK-NEXT: sel z4.d, p5, z26.d, z6.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z24.s, z24.s -; CHECK-NEXT: fcmuo p10.s, p0/z, z25.s, z25.s -; CHECK-NEXT: sel z5.d, p3, z26.d, z7.d -; CHECK-NEXT: fcmuo p0.s, p0/z, z30.s, z30.s -; CHECK-NEXT: sel z7.d, p6, z26.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: frintx z4.s, p0/m, z4.s +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: mov z30.s, w8 +; CHECK-NEXT: movprfx z27, z2 +; CHECK-NEXT: frintx z27.s, p0/m, z2.s +; CHECK-NEXT: uunpkhi z2.d, z3.s +; CHECK-NEXT: frintx z6.s, p0/m, z6.s +; CHECK-NEXT: movprfx z25, z1 +; CHECK-NEXT: frintx z25.s, p0/m, z1.s +; CHECK-NEXT: frintx z5.s, p0/m, z5.s +; CHECK-NEXT: frintx z7.s, p0/m, z7.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.s, p0/z, z4.s, z24.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z24.s +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z9, z2 +; 
CHECK-NEXT: frintx z9.s, p0/m, z2.s +; CHECK-NEXT: fcmge p5.s, p0/z, z6.s, z24.s +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p3.s, p0/z, z5.s, z24.s +; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z24.s +; CHECK-NEXT: fcmge p7.s, p0/z, z7.s, z24.s +; CHECK-NEXT: fcmge p6.s, p0/z, z27.s, z24.s +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: mov z10.d, #0x7fffffffffffffff +; CHECK-NEXT: fcvtzs z1.d, p1/m, z4.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z30.s +; CHECK-NEXT: fcvtzs z26.d, p2/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z9.s, z24.s +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z29.d, p5/m, z6.s +; CHECK-NEXT: fcvtzs z3.d, p3/m, z5.s +; CHECK-NEXT: fcvtzs z28.d, p4/m, z25.s +; CHECK-NEXT: fcvtzs z8.d, p7/m, z7.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z30.s +; CHECK-NEXT: fcmgt p5.s, p0/z, z5.s, z30.s +; CHECK-NEXT: fcmgt p7.s, p0/z, z25.s, z30.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z6.s, z30.s +; CHECK-NEXT: fcvtzs z31.d, p6/m, z27.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z0.s, z0.s +; CHECK-NEXT: sel z0.d, p1, z10.d, z1.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z27.s, z30.s +; CHECK-NEXT: fcmgt p10.s, p0/z, z7.s, z30.s +; CHECK-NEXT: fcvtzs z24.d, p2/m, z9.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z9.s, z30.s +; CHECK-NEXT: fcmuo p3.s, p0/z, z4.s, z4.s +; CHECK-NEXT: fcmuo p9.s, p0/z, z5.s, z5.s +; CHECK-NEXT: sel z1.d, p4, z10.d, z26.d +; CHECK-NEXT: fcmuo p4.s, p0/z, z25.s, z25.s +; CHECK-NEXT: sel z2.d, p5, z10.d, z3.d +; CHECK-NEXT: sel z3.d, p7, z10.d, z28.d +; CHECK-NEXT: sel z4.d, p8, z10.d, z29.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z6.s, z6.s +; CHECK-NEXT: fcmuo p7.s, p0/z, z27.s, z27.s +; CHECK-NEXT: fcmuo p8.s, p0/z, z7.s, z7.s +; CHECK-NEXT: sel z5.d, p1, z10.d, z31.d +; CHECK-NEXT: sel z6.d, p10, z10.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: fcmuo p0.s, p0/z, z9.s, z9.s +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 
16-byte Folded Reload +; CHECK-NEXT: sel z7.d, p2, z10.d, z24.d +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z6.d, p4, z26.d, z31.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p7/m, #0 // =0x0 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z6.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv16i64.nxv16f32( %x) @@ -915,6 +793,8 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p11, [sp] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill @@ -937,8 +817,8 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -949,224 +829,185 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z24.d, z0.s -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z25.d, z0.s ; CHECK-NEXT: mov w9, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z26.d, z1.s -; CHECK-NEXT: uunpkhi z25.d, z0.s -; CHECK-NEXT: uunpkhi z28.d, z1.s -; CHECK-NEXT: mov z29.s, w9 +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z27.d, z2.s +; CHECK-NEXT: uunpkhi z9.d, z2.s +; CHECK-NEXT: uunpklo 
z11.d, z3.s +; CHECK-NEXT: uunpkhi z12.d, z3.s +; CHECK-NEXT: mov z10.s, w9 ; CHECK-NEXT: mov w9, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpkhi z30.d, z2.s -; CHECK-NEXT: uunpklo z8.d, z3.s ; CHECK-NEXT: movprfx z0, z24 ; CHECK-NEXT: frintx z0.s, p0/m, z24.s -; CHECK-NEXT: uunpkhi z9.d, z3.s +; CHECK-NEXT: movprfx z24, z25 +; CHECK-NEXT: frintx z24.s, p0/m, z25.s +; CHECK-NEXT: uunpklo z13.d, z4.s +; CHECK-NEXT: movprfx z25, z26 +; CHECK-NEXT: frintx z25.s, p0/m, z26.s +; CHECK-NEXT: movprfx z26, z1 +; CHECK-NEXT: frintx z26.s, p0/m, z1.s ; CHECK-NEXT: uunpkhi z14.d, z4.s -; CHECK-NEXT: movprfx z24, z26 -; CHECK-NEXT: frintx z24.s, p0/m, z26.s -; CHECK-NEXT: movprfx z1, z25 -; CHECK-NEXT: frintx z1.s, p0/m, z25.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: frintx z5.s, p0/m, z28.s -; CHECK-NEXT: uunpklo z26.d, z2.s -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: mov z25.s, w9 -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: frintx z28.s, p0/m, z30.s -; CHECK-NEXT: movprfx z30, z8 -; CHECK-NEXT: frintx z30.s, p0/m, z8.s -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s -; CHECK-NEXT: movprfx z31, z0 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z0.s -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z29.s -; CHECK-NEXT: fcmge p5.s, p0/z, z5.s, z29.s -; CHECK-NEXT: frintx z26.s, p0/m, z26.s -; CHECK-NEXT: movprfx z10, z1 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z1.s -; CHECK-NEXT: movprfx z11, z24 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.s -; CHECK-NEXT: movprfx z12, z5 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z5.s -; CHECK-NEXT: movprfx z15, z28 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.s -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z25.s -; CHECK-NEXT: fcmgt p9.s, p0/z, z5.s, z25.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; 
CHECK-NEXT: sel z0.d, p4, z27.d, z31.d -; CHECK-NEXT: fcmge p4.s, p0/z, z26.s, z29.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z13, z26 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z26.s -; CHECK-NEXT: sel z31.d, p2, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z4.s -; CHECK-NEXT: sel z8.d, p3, z27.d, z11.d -; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z29.s -; CHECK-NEXT: sel z11.d, p5, z27.d, z12.d -; CHECK-NEXT: movprfx z4, z9 -; CHECK-NEXT: frintx z4.s, p0/m, z9.s -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.s, p0/z, z30.s, z29.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z25.s -; CHECK-NEXT: sel z12.d, p5, z27.d, z13.d -; CHECK-NEXT: uunpkhi z13.d, z17.s -; CHECK-NEXT: movprfx z9, z10 -; CHECK-NEXT: frintx z9.s, p0/m, z10.s -; CHECK-NEXT: movprfx z10, z14 -; CHECK-NEXT: frintx z10.s, p0/m, z14.s -; CHECK-NEXT: uunpkhi z17.d, z6.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: uunpklo z14.d, z6.s -; CHECK-NEXT: movprfx z6, z16 -; CHECK-NEXT: frintx z6.s, p0/m, z16.s -; CHECK-NEXT: uunpklo z16.d, z7.s +; CHECK-NEXT: movprfx z2, z27 +; CHECK-NEXT: frintx z2.s, p0/m, z27.s +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z27, z9 +; CHECK-NEXT: frintx z27.s, p0/m, z9.s +; CHECK-NEXT: movprfx z9, z11 +; CHECK-NEXT: frintx z9.s, p0/m, z11.s +; CHECK-NEXT: movprfx z11, z12 +; CHECK-NEXT: frintx z11.s, p0/m, z12.s +; CHECK-NEXT: uunpklo z15.d, z7.s ; CHECK-NEXT: uunpkhi z7.d, z7.s -; CHECK-NEXT: sel z3.d, p3, z27.d, z15.d -; CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z29.s -; CHECK-NEXT: frintx z13.s, p0/m, z13.s -; CHECK-NEXT: movprfx z15, z30 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z30.s -; CHECK-NEXT: fcmge p5.s, p0/z, z9.s, z29.s -; CHECK-NEXT: fcmge p6.s, p0/z, z10.s, z29.s -; CHECK-NEXT: frintx z17.s, p0/m, z17.s -; CHECK-NEXT: movprfx z18, z4 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.s -; CHECK-NEXT: movprfx z20, z10 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.s 
-; CHECK-NEXT: frintx z16.s, p0/m, z16.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z19, z14 -; CHECK-NEXT: frintx z19.s, p0/m, z14.s -; CHECK-NEXT: movprfx z14, z9 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z9.s -; CHECK-NEXT: fcmge p7.s, p0/z, z6.s, z29.s -; CHECK-NEXT: fcmge p8.s, p0/z, z13.s, z29.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p3.s, p0/z, z26.s, z10.s +; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.s, p0/z, z0.s, z10.s +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.s, p0/z, z24.s, z10.s +; CHECK-NEXT: movprfx z12, z13 +; CHECK-NEXT: frintx z12.s, p0/m, z13.s +; CHECK-NEXT: fcmge p2.s, p0/z, z25.s, z10.s +; CHECK-NEXT: fcmge p4.s, p0/z, z2.s, z10.s +; CHECK-NEXT: movprfx z13, z14 +; CHECK-NEXT: frintx z13.s, p0/m, z14.s +; CHECK-NEXT: uunpklo z17.d, z5.s +; CHECK-NEXT: uunpkhi z18.d, z5.s ; CHECK-NEXT: movprfx z21, z7 ; CHECK-NEXT: frintx z21.s, p0/m, z7.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z15.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.s, p0/z, z17.s, z29.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z7.d, p3, z27.d, z18.d -; CHECK-NEXT: movprfx z0, z17 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z17.s -; CHECK-NEXT: sel z18.d, p6, z27.d, z20.d -; CHECK-NEXT: movprfx z20, z6 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z6.s -; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z19.s, z29.s -; CHECK-NEXT: mov z14.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.s, p0/z, z21.s, z29.s -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.s -; CHECK-NEXT: movprfx z22, z13 -; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.s -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z2, z21 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z21.s 
-; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.s, p0/z, z16.s, z25.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: mov z22.d, p7/m, z27.d -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z21.s, z25.s -; CHECK-NEXT: fcmuo p5.s, p0/z, z16.s, z16.s -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: sel z27.d, p1, z29.d, z31.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z25.s -; CHECK-NEXT: mov z1.d, p4/m, z29.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z26.s, z25.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z30.s, z25.s -; CHECK-NEXT: sel z31.d, p2, z29.d, z8.d -; CHECK-NEXT: fcmgt p2.s, p0/z, z13.s, z25.s -; CHECK-NEXT: fcmuo p8.s, p0/z, z21.s, z21.s -; CHECK-NEXT: mov z2.d, p3/m, z29.d -; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z25.s -; CHECK-NEXT: mov z0.d, p1/m, z29.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z6.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: sel z8.d, p9, z29.d, z11.d -; CHECK-NEXT: sel z11.d, p6, z29.d, z12.d -; CHECK-NEXT: sel z12.d, p7, z29.d, z15.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z10.s, z25.s -; CHECK-NEXT: sel z15.d, p2, z29.d, z22.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z13.s, z13.s -; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: uunpklo z19.d, z6.s +; CHECK-NEXT: uunpkhi z20.d, z6.s +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fcvtzs z31.d, p3/m, z26.s +; CHECK-NEXT: fcmge p3.s, p0/z, z11.s, z10.s +; CHECK-NEXT: mov z5.d, #0x8000000000000000 +; CHECK-NEXT: frintx z15.s, p0/m, z15.s +; CHECK-NEXT: fcvtzs z1.d, p5/m, z0.s +; CHECK-NEXT: fcvtzs z29.d, p1/m, z24.s +; CHECK-NEXT: fcvtzs z30.d, p2/m, z25.s +; CHECK-NEXT: fcvtzs z8.d, p4/m, z2.s +; CHECK-NEXT: fcmge p1.s, p0/z, z27.s, z10.s +; CHECK-NEXT: mov z4.d, 
#0x8000000000000000 +; CHECK-NEXT: fcmge p2.s, p0/z, z9.s, z10.s +; CHECK-NEXT: mov z16.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p4.s, p0/z, z12.s, z10.s +; CHECK-NEXT: mov z6.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.s, p0/z, z13.s, z10.s +; CHECK-NEXT: mov z14.d, #0x8000000000000000 +; CHECK-NEXT: frintx z17.s, p0/m, z17.s +; CHECK-NEXT: frintx z18.s, p0/m, z18.s +; CHECK-NEXT: frintx z19.s, p0/m, z19.s +; CHECK-NEXT: frintx z20.s, p0/m, z20.s +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z5.d, p3/m, z11.s +; CHECK-NEXT: fcmge p3.s, p0/z, z21.s, z10.s +; CHECK-NEXT: mov z3.s, w9 +; CHECK-NEXT: fcmge p6.s, p0/z, z15.s, z10.s +; CHECK-NEXT: mov z22.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z4.d, p1/m, z27.s +; CHECK-NEXT: fcvtzs z16.d, p2/m, z9.s +; CHECK-NEXT: fcvtzs z6.d, p4/m, z12.s +; CHECK-NEXT: fcvtzs z14.d, p5/m, z13.s +; CHECK-NEXT: fcmge p1.s, p0/z, z17.s, z10.s +; CHECK-NEXT: fcmge p2.s, p0/z, z18.s, z10.s +; CHECK-NEXT: mov z23.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p4.s, p0/z, z19.s, z10.s +; CHECK-NEXT: fcmge p5.s, p0/z, z20.s, z10.s +; CHECK-NEXT: mov z10.d, #0x8000000000000000 +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z28.d, p3/m, z21.s +; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p11.s, p0/z, z21.s, z3.s +; CHECK-NEXT: fcvtzs z22.d, p6/m, z15.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z15.s, z3.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z15.s, z15.s +; CHECK-NEXT: mov z15.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p7.s, p0/z, z24.s, z3.s +; CHECK-NEXT: fcvtzs z23.d, p2/m, z18.s +; CHECK-NEXT: fcvtzs z10.d, p5/m, z20.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z9.s, z3.s +; CHECK-NEXT: fcmgt p5.s, p0/z, z20.s, z3.s +; CHECK-NEXT: fcvtzs z0.d, p4/m, z19.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z21.s, z21.s +; CHECK-NEXT: mov z28.d, p11/m, z7.d +; CHECK-NEXT: sel z21.d, p3, z7.d, z22.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z3.s +; CHECK-NEXT: fcvtzs z15.d, p1/m, z17.s +; CHECK-NEXT: fcmuo p1.s, 
p0/z, z20.s, z20.s +; CHECK-NEXT: mov z29.d, p7/m, z7.d +; CHECK-NEXT: fcmgt p7.s, p0/z, z18.s, z3.s +; CHECK-NEXT: mov z16.d, p2/m, z7.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z17.s, z3.s +; CHECK-NEXT: mov z10.d, p5/m, z7.d +; CHECK-NEXT: mov z28.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z19.s, z19.s +; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z18.s, z18.s +; CHECK-NEXT: fcmgt p5.s, p0/z, z13.s, z3.s +; CHECK-NEXT: mov z21.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p10.s, p0/z, z2.s, z3.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z25.s, z3.s +; CHECK-NEXT: str z28, [x8, #15, mul vl] +; CHECK-NEXT: mov z10.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z17.s, z17.s +; CHECK-NEXT: sel z19.d, p7, z7.d, z23.d +; CHECK-NEXT: sel z28.d, p2, z7.d, z15.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z12.s, z3.s +; CHECK-NEXT: str z21, [x8, #14, mul vl] ; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z29.d, z20.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z9.s, z25.s -; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s -; CHECK-NEXT: sel z16.d, p3, z29.d, z23.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z6.s, z6.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z4.s, z25.s -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z29.d, z18.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z10.s, z10.s -; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.s, p0/z, z9.s, z9.s -; CHECK-NEXT: sel z0.d, p1, z29.d, z14.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z4.s, z4.s -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z25.s -; CHECK-NEXT: sel z4.d, p4, z29.d, z7.d -; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s -; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z30.s, z30.s -; CHECK-NEXT: str z1, [x8, #10, mul vl] -; 
CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: sel z1.d, p3, z29.d, z3.d -; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: mov z14.d, p5/m, z7.d +; CHECK-NEXT: str z10, [x8, #13, mul vl] +; CHECK-NEXT: fcmgt p5.s, p0/z, z11.s, z3.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z13.s, z13.s +; CHECK-NEXT: mov z19.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z28.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p1.s, p0/z, z27.s, z3.s +; CHECK-NEXT: str z0, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p3.s, p0/z, z12.s, z12.s +; CHECK-NEXT: sel z0.d, p2, z7.d, z6.d +; CHECK-NEXT: fcmuo p2.s, p0/z, z11.s, z11.s +; CHECK-NEXT: fcmgt p9.s, p0/z, z26.s, z3.s +; CHECK-NEXT: mov z30.d, p8/m, z7.d +; CHECK-NEXT: str z19, [x8, #11, mul vl] +; CHECK-NEXT: mov z5.d, p5/m, z7.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z9.s, z9.s +; CHECK-NEXT: str z28, [x8, #10, mul vl] +; CHECK-NEXT: mov z4.d, p1/m, z7.d +; CHECK-NEXT: fcmuo p1.s, p0/z, z2.s, z2.s +; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z14.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z27.s, z27.s +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.s, p0/z, z26.s, z26.s +; CHECK-NEXT: mov z16.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z25.s, z25.s +; CHECK-NEXT: mov z31.d, p9/m, z7.d +; CHECK-NEXT: str z14, [x8, #9, mul vl] +; CHECK-NEXT: fcmgt p3.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z8.d, p10/m, z7.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p3.s, p0/z, z26.s, z26.s -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z4, [x8, #7, mul vl] -; CHECK-NEXT: mov z12.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z4.d, p4/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p4.s, p0/z, z24.s, z24.s +; CHECK-NEXT: str z5, [x8, #7, mul vl] +; 
CHECK-NEXT: fcmuo p0.s, p0/z, z2.s, z2.s +; CHECK-NEXT: mov z31.d, p2/m, #0 // =0x0 +; CHECK-NEXT: str z16, [x8, #6, mul vl] ; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z0.s, z0.s -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str z12, [x8, #6, mul vl] -; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: fcmuo p0.s, p0/z, z3.s, z3.s -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p2/m, z29.d -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z31, [x8, #2, mul vl] +; CHECK-NEXT: mov z30.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z4, [x8, #5, mul vl] +; CHECK-NEXT: sel z0.d, p3, z7.d, z1.d +; CHECK-NEXT: str z31, [x8, #3, mul vl] +; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z8, [x8, #4, mul vl] +; CHECK-NEXT: str z30, [x8, #2, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 -; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: str z29, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1183,6 +1024,8 @@ define @llrint_v32i64_v32f32( %x) { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload @@ -1202,20 +1045,17 @@ define @llrint_v1i64_v1f64( %x) { ; 
CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv1i64.nxv1f64( %x) @@ -1228,20 +1068,17 @@ define @llrint_v2i64_v2f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: 
mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv2i64.nxv2f64( %x) @@ -1252,41 +1089,28 @@ declare @llvm.llrint.nxv2i64.nxv2f64() define @llrint_v4i64_v4f64( %x) { ; CHECK-LABEL: llrint_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d +; CHECK-NEXT: fcvtzs z3.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d +; CHECK-NEXT: fcvtzs z4.d, p2/m, z1.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z1.d, z5.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte 
Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv4i64.nxv4f64( %x) ret %a @@ -1298,7 +1122,6 @@ define @llrint_v8i64_v8f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -1308,52 +1131,42 @@ define @llrint_v8i64_v8f64( %x) { ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: mov z6.d, #0x8000000000000000 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: frintx z2.d, p0/m, z2.d ; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: mov z7.d, #0x8000000000000000 +; CHECK-NEXT: mov z24.d, #0x8000000000000000 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z4.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmge p3.d, p0/z, z2.d, z4.d ; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d -; CHECK-NEXT: movprfx z24, z2 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z2.d -; CHECK-NEXT: movprfx z25, z3 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z2.d, z6.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z6.d -; CHECK-NEXT: 
fcmgt p6.d, p0/z, z1.d, z6.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z2.d, z2.d +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z25.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z1.d, z1.d +; CHECK-NEXT: fcvtzs z5.d, p1/m, z0.d +; CHECK-NEXT: fcvtzs z6.d, p2/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z25.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z1.d, z25.d +; CHECK-NEXT: fcvtzs z7.d, p3/m, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z2.d, z25.d +; CHECK-NEXT: fcvtzs z24.d, p4/m, z3.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z0.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z2.d, z2.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 
; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1367,7 +1180,7 @@ define @llrint_v16f64( %x) { ; CHECK-LABEL: llrint_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -1375,109 +1188,93 @@ define @llrint_v16f64( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z24.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: mov z24.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: movprfx z26, z0 -; CHECK-NEXT: frintx z26.d, p0/m, z0.d -; CHECK-NEXT: movprfx z27, z1 -; CHECK-NEXT: frintx z27.d, p0/m, z1.d +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; 
CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: frintx z1.d, p0/m, z1.d +; CHECK-NEXT: movprfx z25, z4 +; CHECK-NEXT: frintx z25.d, p0/m, z4.d ; CHECK-NEXT: frintx z2.d, p0/m, z2.d -; CHECK-NEXT: mov z0.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: movprfx z28, z4 -; CHECK-NEXT: frintx z28.d, p0/m, z4.d ; CHECK-NEXT: frintx z5.d, p0/m, z5.d ; CHECK-NEXT: frintx z6.d, p0/m, z6.d +; CHECK-NEXT: mov z30.d, x8 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: frintx z7.d, p0/m, z7.d -; CHECK-NEXT: fcmge p1.d, p0/z, z26.d, z25.d -; CHECK-NEXT: fcmge p2.d, p0/z, z27.d, z25.d -; CHECK-NEXT: movprfx z4, z26 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z26.d -; CHECK-NEXT: fcmge p5.d, p0/z, z2.d, z25.d -; CHECK-NEXT: movprfx z29, z27 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z27.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z26.d, z1.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z25.d -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z25.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z27.d, z1.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z25.d -; CHECK-NEXT: movprfx z30, z28 -; CHECK-NEXT: fcvtzs z30.d, p0/m, z28.d -; CHECK-NEXT: fcmge p10.d, p0/z, z7.d, z25.d -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.d, p0/z, z26.d, z26.d -; CHECK-NEXT: movprfx z26, z2 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z2.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z6 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z6.d -; CHECK-NEXT: movprfx z8, z7 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z7.d -; CHECK-NEXT: mov z4.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.d, p0/z, z28.d, z25.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z29.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z27.d, z27.d -; CHECK-NEXT: movprfx z27, z3 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z3.d -; CHECK-NEXT: sel z25.d, p5, z0.d, z26.d -; CHECK-NEXT: movprfx z26, z5 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z5.d -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.d, p0/z, 
z2.d, z1.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z27.d, p6/m, z0.d -; CHECK-NEXT: not p6.b, p0/z, p9.b +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z24.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z24.d +; CHECK-NEXT: fcmge p5.d, p0/z, z25.d, z24.d +; CHECK-NEXT: fcmge p3.d, p0/z, z2.d, z24.d +; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z24.d +; CHECK-NEXT: fcmge p7.d, p0/z, z5.d, z24.d +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p6.d, p0/z, z6.d, z24.d +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: mov z9.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p8.d, p0/z, z25.d, z30.d +; CHECK-NEXT: fcmgt p10.d, p0/z, z6.d, z30.d +; CHECK-NEXT: fcvtzs z26.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z30.d +; CHECK-NEXT: fcvtzs z4.d, p2/m, z1.d +; CHECK-NEXT: fcmge p2.d, p0/z, z7.d, z24.d +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z27.d, p3/m, z2.d +; CHECK-NEXT: fcvtzs z28.d, p4/m, z3.d +; CHECK-NEXT: fcvtzs z29.d, p5/m, z25.d +; CHECK-NEXT: fcvtzs z31.d, p7/m, z5.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z30.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z30.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z30.d +; CHECK-NEXT: fcvtzs z8.d, p6/m, z6.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z9.d, z26.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z5.d, z30.d +; CHECK-NEXT: fcvtzs z24.d, p2/m, z7.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z7.d, z30.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p9.d, p0/z, z2.d, z2.d -; CHECK-NEXT: mov z30.d, p4/m, z0.d -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.d, p0/z, z3.d, z1.d -; CHECK-NEXT: mov z26.d, p5/m, z0.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z28.d, z1.d -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z24.d, z4.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z5.d, z1.d -; CHECK-NEXT: fcmgt p4.d, p0/z, 
z6.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z7.d, z1.d -; CHECK-NEXT: sel z1.d, p7, z24.d, z29.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z2.d, p8, z24.d, z25.d -; CHECK-NEXT: sel z3.d, p10, z24.d, z27.d -; CHECK-NEXT: sel z4.d, p5, z24.d, z30.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z28.d, z28.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z5.d, z5.d -; CHECK-NEXT: fcmuo p10.d, p0/z, z6.d, z6.d -; CHECK-NEXT: sel z5.d, p3, z24.d, z26.d +; CHECK-NEXT: sel z1.d, p4, z9.d, z4.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d +; CHECK-NEXT: sel z2.d, p5, z9.d, z27.d +; CHECK-NEXT: sel z3.d, p7, z9.d, z28.d +; CHECK-NEXT: sel z4.d, p8, z9.d, z29.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z25.d, z25.d +; CHECK-NEXT: fcmuo p7.d, p0/z, z5.d, z5.d +; CHECK-NEXT: fcmuo p8.d, p0/z, z6.d, z6.d +; CHECK-NEXT: sel z5.d, p1, z9.d, z31.d +; CHECK-NEXT: sel z6.d, p10, z9.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: fcmuo p0.d, p0/z, z7.d, z7.d -; CHECK-NEXT: sel z6.d, p4, z24.d, z31.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z7.d, p6, z24.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z7.d, p2, z9.d, z24.d +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 ; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 +; 
CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z5.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z6.d, p8/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.llrint.nxv16i64.nxv16f64( %x) @@ -1490,6 +1287,8 @@ define @llrint_v32f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p11, [sp] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill @@ -1512,8 +1311,8 @@ define @llrint_v32f64( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 
16 - 16 * VG @@ -1526,219 +1325,176 @@ define @llrint_v32f64( %x) { ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z2, [x0, #2, mul vl] -; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: ldr z24, [x0, #6, mul vl] ; CHECK-NEXT: ldr z1, [x0, #1, mul vl] -; CHECK-NEXT: mov z7.d, x9 -; CHECK-NEXT: mov z26.d, #0x8000000000000000 -; CHECK-NEXT: ldr z3, [x0, #3, mul vl] +; CHECK-NEXT: ldr z6, [x0, #4, mul vl] +; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ldr z5, [x0, #3, mul vl] +; CHECK-NEXT: mov z25.d, x9 +; CHECK-NEXT: mov z28.d, #0x8000000000000000 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: movprfx z30, z2 -; CHECK-NEXT: frintx z30.d, p0/m, z2.d -; CHECK-NEXT: ldr z6, [x0, #5, mul vl] -; CHECK-NEXT: movprfx z25, z24 -; CHECK-NEXT: frintx z25.d, p0/m, z24.d -; CHECK-NEXT: movprfx z12, z1 -; CHECK-NEXT: frintx z12.d, p0/m, z1.d -; CHECK-NEXT: ldr z5, [x0, #4, mul vl] -; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: movprfx z4, z2 +; CHECK-NEXT: frintx z4.d, p0/m, z2.d +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: frintx z6.d, p0/m, z6.d -; CHECK-NEXT: mov z4.d, x9 -; CHECK-NEXT: fcmge p3.d, p0/z, z0.d, z7.d -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d -; CHECK-NEXT: fcmge p5.d, p0/z, z30.d, z7.d -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.d -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov z30.d, #0x8000000000000000 ; CHECK-NEXT: frintx z5.d, p0/m, z5.d -; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z7.d -; CHECK-NEXT: ldr z8, [x0, #7, mul vl] -; CHECK-NEXT: ldr z9, [x0, #15, mul vl] -; CHECK-NEXT: movprfx z27, z12 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z12.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z7.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z7.d -; CHECK-NEXT: not p7.b, 
p0/z, p3.b -; CHECK-NEXT: movprfx z31, z3 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z3.d -; CHECK-NEXT: movprfx z15, z6 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z6.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z7.d -; CHECK-NEXT: movprfx z13, z5 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z5.d -; CHECK-NEXT: sel z0.d, p7, z26.d, z24.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z17, z25 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z25.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: fcmge p6.d, p0/z, z25.d, z7.d -; CHECK-NEXT: movprfx z22, z9 -; CHECK-NEXT: frintx z22.d, p0/m, z9.d -; CHECK-NEXT: sel z29.d, p4, z26.d, z27.d -; CHECK-NEXT: movprfx z27, z8 -; CHECK-NEXT: frintx z27.d, p0/m, z8.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z4.d -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p5, z26.d, z28.d -; CHECK-NEXT: not p4.b, p0/z, p8.b -; CHECK-NEXT: ldr z10, [x0, #8, mul vl] -; CHECK-NEXT: not p5.b, p0/z, p9.b -; CHECK-NEXT: sel z24.d, p3, z26.d, z31.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: movprfx z2, z22 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z22.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z30.d, z4.d +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: mov z13.d, #0x8000000000000000 +; CHECK-NEXT: mov z12.d, #0x8000000000000000 +; CHECK-NEXT: mov x10, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z7.d -; CHECK-NEXT: sel z31.d, p5, z26.d, z15.d -; CHECK-NEXT: ldr z11, [x0, #9, mul vl] -; CHECK-NEXT: movprfx z28, z10 -; CHECK-NEXT: frintx z28.d, p0/m, z10.d -; CHECK-NEXT: ldr z10, [x0, #10, mul vl] -; CHECK-NEXT: ldr z18, [x0, #11, mul vl] -; CHECK-NEXT: ldr z16, [x0, #13, mul vl] -; CHECK-NEXT: ldr z14, [x0, #14, mul vl] -; CHECK-NEXT: ldr z19, [x0, #12, mul vl] -; CHECK-NEXT: mov z17.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p9.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx 
z8, z11 -; CHECK-NEXT: frintx z8.d, p0/m, z11.d -; CHECK-NEXT: sel z11.d, p4, z26.d, z13.d -; CHECK-NEXT: frintx z10.d, p0/m, z10.d -; CHECK-NEXT: movprfx z13, z18 -; CHECK-NEXT: frintx z13.d, p0/m, z18.d -; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z7.d -; CHECK-NEXT: movprfx z18, z27 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z27.d +; CHECK-NEXT: fcmge p3.d, p0/z, z4.d, z25.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z25.d +; CHECK-NEXT: ldr z29, [x0, #7, mul vl] +; CHECK-NEXT: ldr z24, [x0, #6, mul vl] +; CHECK-NEXT: ldr z10, [x0, #9, mul vl] +; CHECK-NEXT: ldr z8, [x0, #8, mul vl] +; CHECK-NEXT: ldr z7, [x0, #5, mul vl] +; CHECK-NEXT: ldr z14, [x0, #15, mul vl] +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z25.d +; CHECK-NEXT: fcmge p5.d, p0/z, z6.d, z25.d +; CHECK-NEXT: ldr z15, [x0, #14, mul vl] +; CHECK-NEXT: frintx z29.d, p0/m, z29.d +; CHECK-NEXT: frintx z24.d, p0/m, z24.d +; CHECK-NEXT: movprfx z11, z10 +; CHECK-NEXT: frintx z11.d, p0/m, z10.d +; CHECK-NEXT: fcmge p4.d, p0/z, z5.d, z25.d +; CHECK-NEXT: movprfx z9, z8 +; CHECK-NEXT: frintx z9.d, p0/m, z8.d +; CHECK-NEXT: ldr z16, [x0, #11, mul vl] +; CHECK-NEXT: ldr z20, [x0, #13, mul vl] +; CHECK-NEXT: frintx z7.d, p0/m, z7.d +; CHECK-NEXT: fcvtzs z28.d, p3/m, z4.d +; CHECK-NEXT: mov z10.d, #0x8000000000000000 +; CHECK-NEXT: ldr z18, [x0, #12, mul vl] +; CHECK-NEXT: movprfx z19, z14 +; CHECK-NEXT: frintx z19.d, p0/m, z14.d +; CHECK-NEXT: fcmge p3.d, p0/z, z29.d, z25.d +; CHECK-NEXT: ldr z17, [x0, #10, mul vl] +; CHECK-NEXT: frintx z15.d, p0/m, z15.d +; CHECK-NEXT: fcvtzs z27.d, p2/m, z1.d +; CHECK-NEXT: fcvtzs z30.d, p5/m, z6.d +; CHECK-NEXT: fcmge p2.d, p0/z, z24.d, z25.d +; CHECK-NEXT: fcmge p5.d, p0/z, z11.d, z25.d +; CHECK-NEXT: mov z14.d, #0x8000000000000000 ; CHECK-NEXT: frintx z16.d, p0/m, z16.d -; CHECK-NEXT: movprfx z15, z19 -; CHECK-NEXT: frintx z15.d, p0/m, z19.d -; CHECK-NEXT: movprfx z19, z28 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.d -; CHECK-NEXT: movprfx z21, z14 -; CHECK-NEXT: frintx z21.d, p0/m, z14.d -; 
CHECK-NEXT: not p4.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.d, p0/z, z8.d, z7.d -; CHECK-NEXT: movprfx z20, z8 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z8.d -; CHECK-NEXT: fcmge p7.d, p0/z, z10.d, z7.d -; CHECK-NEXT: fcmge p8.d, p0/z, z13.d, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z9.d, p4, z26.d, z18.d -; CHECK-NEXT: fcmge p4.d, p0/z, z16.d, z7.d -; CHECK-NEXT: fcmge p3.d, p0/z, z15.d, z7.d -; CHECK-NEXT: movprfx z0, z16 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d -; CHECK-NEXT: sel z14.d, p5, z26.d, z19.d -; CHECK-NEXT: movprfx z19, z10 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z10.d -; CHECK-NEXT: movprfx z1, z21 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z21.d -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: movprfx z23, z15 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.d -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: sel z18.d, p6, z26.d, z20.d -; CHECK-NEXT: fcmge p6.d, p0/z, z21.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.d, p0/z, z22.d, z7.d -; CHECK-NEXT: movprfx z20, z13 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z13.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z19.d, p5/m, z26.d -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z0.d, p4/m, z26.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z21.d, z4.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z22.d, z4.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z20.d, p7/m, z26.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z22.d, z22.d -; CHECK-NEXT: mov z1.d, p5/m, z26.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z21.d, z21.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z25.d, z4.d -; CHECK-NEXT: mov z2.d, p6/m, z26.d -; CHECK-NEXT: sel z26.d, p1, z7.d, z29.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z4.d -; CHECK-NEXT: ldr z29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z4.d -; CHECK-NEXT: mov z24.d, p9/m, z7.d -; CHECK-NEXT: mov z1.d, p4/m, z7.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z16.d, z16.d -; 
CHECK-NEXT: mov z2.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z4.d -; CHECK-NEXT: mov z17.d, p7/m, z7.d -; CHECK-NEXT: mov z29.d, p2/m, z7.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z4.d -; CHECK-NEXT: mov z0.d, p1/m, z7.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z4.d -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z11.d, p6/m, z7.d +; CHECK-NEXT: frintx z20.d, p0/m, z20.d +; CHECK-NEXT: fcvtzs z26.d, p4/m, z5.d +; CHECK-NEXT: fcmge p4.d, p0/z, z9.d, z25.d +; CHECK-NEXT: frintx z18.d, p0/m, z18.d +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z7.d, z25.d +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: frintx z17.d, p0/m, z17.d +; CHECK-NEXT: fcvtzs z10.d, p3/m, z29.d +; CHECK-NEXT: fcmge p3.d, p0/z, z19.d, z25.d +; CHECK-NEXT: mov z3.d, x10 +; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z25.d +; CHECK-NEXT: mov z21.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z13.d, p2/m, z24.d +; CHECK-NEXT: fcvtzs z14.d, p5/m, z11.d +; CHECK-NEXT: fcmge p2.d, p0/z, z16.d, z25.d +; CHECK-NEXT: mov z22.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.d, p0/z, z20.d, z25.d +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z12.d, p4/m, z9.d +; CHECK-NEXT: fcmge p4.d, p0/z, z18.d, z25.d +; CHECK-NEXT: mov z23.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z8.d, p1/m, z7.d +; CHECK-NEXT: fcmge p1.d, p0/z, z17.d, z25.d +; CHECK-NEXT: fcvtzs z31.d, p3/m, z19.d +; CHECK-NEXT: mov z25.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p11.d, p0/z, z19.d, z3.d +; CHECK-NEXT: fcvtzs z21.d, p6/m, z15.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z3.d ; CHECK-NEXT: fcmuo p6.d, p0/z, z15.d, z15.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z8.d, z4.d -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p3, z7.d, z23.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z10.d, z10.d -; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z15.d, p2, z7.d, z20.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z13.d, z13.d 
-; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z1.d, p1, z7.d, z19.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z4.d -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z7.d, z18.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z8.d, z8.d +; CHECK-NEXT: mov z15.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z3.d +; CHECK-NEXT: fcvtzs z22.d, p2/m, z16.d +; CHECK-NEXT: fcvtzs z0.d, p5/m, z20.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z24.d, z3.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z20.d, z3.d +; CHECK-NEXT: fcvtzs z23.d, p4/m, z18.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z19.d, z19.d +; CHECK-NEXT: mov z31.d, p11/m, z25.d +; CHECK-NEXT: sel z19.d, p3, z25.d, z21.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z18.d, z3.d +; CHECK-NEXT: fcvtzs z15.d, p1/m, z17.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z20.d, z20.d +; CHECK-NEXT: mov z27.d, p7/m, z25.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z16.d, z3.d +; CHECK-NEXT: mov z13.d, p2/m, z25.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z17.d, z3.d +; CHECK-NEXT: mov z0.d, p5/m, z25.d +; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z18.d, z18.d +; CHECK-NEXT: sel z20.d, p3, z25.d, z23.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z16.d, z16.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z11.d, z3.d +; CHECK-NEXT: mov z19.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p10.d, p0/z, z6.d, z3.d +; CHECK-NEXT: fcmgt p8.d, p0/z, z4.d, z3.d +; CHECK-NEXT: str z31, [x8, #15, mul vl] +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.d, p0/z, z17.d, z17.d +; CHECK-NEXT: sel z18.d, p7, z25.d, z22.d +; CHECK-NEXT: sel z31.d, p2, z25.d, z15.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z9.d, z3.d +; CHECK-NEXT: str z19, [x8, #14, mul vl] +; CHECK-NEXT: mov z20.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z11.d, z11.d ; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d -; CHECK-NEXT: mov 
z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z4.d -; CHECK-NEXT: sel z0.d, p1, z7.d, z14.d -; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d -; CHECK-NEXT: sel z27.d, p4, z7.d, z9.d -; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: fcmuo p4.d, p0/z, z25.d, z25.d -; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d -; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z7.d, z31.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z5.d, z5.d -; CHECK-NEXT: ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov z27.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z2, [x8, #9, mul vl] -; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d +; CHECK-NEXT: mov z14.d, p5/m, z25.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z29.d, z3.d +; CHECK-NEXT: mov z18.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z31.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p1.d, p0/z, z7.d, z3.d +; CHECK-NEXT: str z20, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p3.d, p0/z, z9.d, z9.d +; CHECK-NEXT: sel z0.d, p2, z25.d, z12.d +; CHECK-NEXT: mov z14.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z7.d, z7.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z29.d, z29.d +; CHECK-NEXT: str z18, [x8, #11, mul vl] +; CHECK-NEXT: sel z29.d, p5, z25.d, z10.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z24.d, z24.d +; CHECK-NEXT: str z31, [x8, #10, mul vl] +; CHECK-NEXT: sel z7.d, p1, z25.d, z8.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z6.d, z6.d +; CHECK-NEXT: ldr z6, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str z14, [x8, #9, mul vl] +; CHECK-NEXT: fcmgt p9.d, p0/z, z5.d, z3.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z29.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.d, p0/z, z5.d, z5.d +; CHECK-NEXT: mov z13.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z4.d, z4.d +; CHECK-NEXT: mov z7.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z3.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d +; 
CHECK-NEXT: fcmuo p0.d, p0/z, z6.d, z6.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: mov z17.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z4.d -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z12.d, z12.d -; CHECK-NEXT: str z27, [x8, #7, mul vl] -; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z17, [x8, #6, mul vl] -; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 -; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: str z24, [x8, #3, mul vl] -; CHECK-NEXT: str z29, [x8, #2, mul vl] -; CHECK-NEXT: str z26, [x8, #1, mul vl] +; CHECK-NEXT: mov z28.d, p8/m, z25.d +; CHECK-NEXT: mov z26.d, p9/m, z25.d +; CHECK-NEXT: str z29, [x8, #7, mul vl] +; CHECK-NEXT: mov z30.d, p10/m, z25.d +; CHECK-NEXT: str z13, [x8, #6, mul vl] +; CHECK-NEXT: str z7, [x8, #5, mul vl] +; CHECK-NEXT: sel z0.d, p3, z25.d, z2.d +; CHECK-NEXT: mov z26.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z30.d, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z28.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z27.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z26, [x8, #3, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z30, [x8, #4, mul vl] +; CHECK-NEXT: str z28, [x8, #2, mul vl] +; CHECK-NEXT: str z27, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1755,6 +1511,8 @@ define @llrint_v32f64( %x) { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul 
vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-lrint.ll b/llvm/test/CodeGen/AArch64/sve-lrint.ll index 908ba2392a437..aa5863901b9d3 100644 --- a/llvm/test/CodeGen/AArch64/sve-lrint.ll +++ b/llvm/test/CodeGen/AArch64/sve-lrint.ll @@ -7,20 +7,17 @@ define @lrint_v1f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv1iXLen.nxv1f16( %x) @@ -33,20 +30,17 @@ define @lrint_v2f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff ; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: mov z3.h, w8 +; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: fcmge p1.h, 
p0/z, z0.h, z1.h -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z3.h -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv2iXLen.nxv2f16( %x) @@ -57,43 +51,30 @@ declare @llvm.lrint.nxv2iXLen.nxv2f16() define @lrint_v4f16( %x) { ; CHECK-LABEL: lrint_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.h, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.h, w8 ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; 
CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z0.h, z5.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv4iXLen.nxv4f16( %x) ret %a @@ -105,7 +86,6 @@ define @lrint_v8f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -117,8 +97,10 @@ define @lrint_v8f16( %x) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z6.h, w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z6.d, #0x8000000000000000 +; CHECK-NEXT: mov z25.h, w8 +; CHECK-NEXT: mov z7.d, #0x8000000000000000 +; CHECK-NEXT: mov z24.d, #0x8000000000000000 ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -133,41 +115,29 @@ define @lrint_v8f16( %x) { ; CHECK-NEXT: fcmge p2.h, p0/z, z1.h, z4.h ; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z4.h ; CHECK-NEXT: fcmge p4.h, p0/z, z5.h, z4.h -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.h -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.h -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.h -; CHECK-NEXT: movprfx z25, z5 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z5.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z3.h, z6.h -; CHECK-NEXT: fcmgt p5.h, p0/z, z2.h, z6.h -; CHECK-NEXT: fcmgt p6.h, p0/z, z1.h, z6.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z0.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z5.h, z6.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z0.d, z7.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: sel z7.d, p3, z0.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z1.h, z1.h -; CHECK-NEXT: sel z24.d, p4, z0.d, z25.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z3.h, z3.h +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p5.h, p0/z, z5.h, z25.h +; CHECK-NEXT: fcmuo p6.h, p0/z, z1.h, z1.h +; CHECK-NEXT: 
fcvtzs z0.d, p1/m, z2.h +; CHECK-NEXT: fcvtzs z6.d, p2/m, z1.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z2.h, z25.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z25.h +; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z3.h, z25.h +; CHECK-NEXT: fcvtzs z24.d, p4/m, z5.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z2.h, z2.h +; CHECK-NEXT: mov z0.d, p1/m, z4.d +; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z5.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z24.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -181,7 +151,7 @@ define @lrint_v16f16( %x) { ; CHECK-LABEL: lrint_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -189,124 +159,110 @@ define @lrint_v16f16( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: uunpklo z4.s, z1.h -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: mov z5.h, w8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z24.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z25.d, #0x8000000000000000 -; CHECK-NEXT: mov z27.h, w8 -; CHECK-NEXT: mov z7.d, 
#0x7fffffffffffffff -; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: mov z7.d, #0x8000000000000000 +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: uunpklo z4.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpklo z6.d, z0.s +; CHECK-NEXT: uunpklo z5.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: uunpklo z24.d, z4.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z26.d, z1.s -; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z6.d, z3.s +; CHECK-NEXT: uunpklo z25.d, z1.s +; CHECK-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: mov z10.d, #0x7fffffffffffffff +; CHECK-NEXT: frintx z4.h, p0/m, z4.h ; CHECK-NEXT: frintx z2.h, p0/m, z2.h -; CHECK-NEXT: frintx z3.h, p0/m, z3.h +; CHECK-NEXT: frintx z5.h, p0/m, z5.h +; CHECK-NEXT: movprfx z26, z0 +; CHECK-NEXT: frintx z26.h, p0/m, z0.h +; CHECK-NEXT: uunpkhi z0.d, z1.s ; CHECK-NEXT: frintx z6.h, p0/m, z6.h -; CHECK-NEXT: movprfx z28, z0 -; CHECK-NEXT: frintx z28.h, p0/m, z0.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: frintx z29.h, p0/m, z4.h -; CHECK-NEXT: frintx z24.h, p0/m, z24.h -; CHECK-NEXT: movprfx z30, z1 -; CHECK-NEXT: frintx z30.h, p0/m, z1.h -; CHECK-NEXT: frintx z26.h, p0/m, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z2.h, z5.h -; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, z5.h -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.h -; CHECK-NEXT: movprfx z0, z3 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z3.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z5.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z3.h, z27.h -; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: fcmge p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: movprfx z3, z6 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z6.h -; CHECK-NEXT: fcmge p8.h, p0/z, z24.h, z5.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z2.h, z27.h -; CHECK-NEXT: fcmge p9.h, p0/z, z26.h, z5.h -; CHECK-NEXT: 
not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z4, z24 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z24.h -; CHECK-NEXT: fcmge p10.h, p0/z, z30.h, z5.h -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z26 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z26.h -; CHECK-NEXT: movprfx z8, z30 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.h -; CHECK-NEXT: mov z1.d, p5/m, z25.d -; CHECK-NEXT: fcmge p5.h, p0/z, z29.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p2/m, z25.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z2.h -; CHECK-NEXT: movprfx z2, z28 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z28.h -; CHECK-NEXT: movprfx z5, z29 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z29.h -; CHECK-NEXT: not p7.b, p0/z, p7.b -; CHECK-NEXT: mov z3.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.h, p0/z, z6.h, z27.h -; CHECK-NEXT: mov z1.d, p4/m, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z29.h, z27.h -; CHECK-NEXT: sel z9.d, p7, z25.d, z2.d -; CHECK-NEXT: not p7.b, p0/z, p9.b -; CHECK-NEXT: mov z4.d, p6/m, z25.d -; CHECK-NEXT: not p6.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.h, p0/z, z28.h, z27.h -; CHECK-NEXT: mov z5.d, p5/m, z25.d -; CHECK-NEXT: fcmgt p5.h, p0/z, z24.h, z27.h -; CHECK-NEXT: fcmuo p9.h, p0/z, z6.h, z6.h -; CHECK-NEXT: sel z6.d, p7, z25.d, z31.d -; CHECK-NEXT: sel z25.d, p6, z25.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z26.h, z27.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z30.h, z27.h -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h -; CHECK-NEXT: sel z2.d, p8, z7.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z7.d, z9.d -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: fcmuo p8.h, p0/z, z29.h, z29.h -; CHECK-NEXT: mov z4.d, p5/m, z7.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z24.h, z24.h -; CHECK-NEXT: fcmuo p10.h, p0/z, z26.h, z26.h -; CHECK-NEXT: mov z5.d, p3/m, z7.d -; CHECK-NEXT: mov z6.d, p6/m, z7.d +; CHECK-NEXT: movprfx z29, 
z3 +; CHECK-NEXT: frintx z29.h, p0/m, z3.h +; CHECK-NEXT: frintx z25.h, p0/m, z25.h +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.h, p0/z, z4.h, z24.h +; CHECK-NEXT: fcmge p2.h, p0/z, z2.h, z24.h +; CHECK-NEXT: fcmge p3.h, p0/z, z5.h, z24.h +; CHECK-NEXT: movprfx z9, z0 +; CHECK-NEXT: frintx z9.h, p0/m, z0.h +; CHECK-NEXT: fcmge p4.h, p0/z, z26.h, z24.h +; CHECK-NEXT: fcmge p5.h, p0/z, z6.h, z24.h +; CHECK-NEXT: fcmge p7.h, p0/z, z25.h, z24.h +; CHECK-NEXT: fcmge p6.h, p0/z, z29.h, z24.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z6.h, z1.h +; CHECK-NEXT: fcmgt p10.h, p0/z, z25.h, z1.h +; CHECK-NEXT: fcmuo p9.h, p0/z, z5.h, z5.h +; CHECK-NEXT: fcvtzs z7.d, p1/m, z4.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z4.h, z1.h +; CHECK-NEXT: fcvtzs z27.d, p2/m, z2.h +; CHECK-NEXT: fcmge p2.h, p0/z, z9.h, z24.h +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z28.d, p3/m, z5.h +; CHECK-NEXT: fcvtzs z3.d, p4/m, z26.h +; CHECK-NEXT: fcvtzs z30.d, p5/m, z6.h +; CHECK-NEXT: fcvtzs z8.d, p7/m, z25.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z5.h, z1.h +; CHECK-NEXT: fcmgt p7.h, p0/z, z26.h, z1.h +; CHECK-NEXT: fcvtzs z31.d, p6/m, z29.h +; CHECK-NEXT: sel z0.d, p1, z10.d, z7.d +; CHECK-NEXT: fcmgt p1.h, p0/z, z29.h, z1.h +; CHECK-NEXT: fcvtzs z24.d, p2/m, z9.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z9.h, z1.h +; CHECK-NEXT: fcmuo p3.h, p0/z, z4.h, z4.h +; CHECK-NEXT: fcmuo p6.h, p0/z, z2.h, z2.h +; CHECK-NEXT: sel z4.d, p8, z10.d, z30.d +; CHECK-NEXT: fcmuo p8.h, p0/z, z25.h, z25.h +; CHECK-NEXT: sel z1.d, p4, z10.d, z27.d +; CHECK-NEXT: fcmuo p4.h, p0/z, z26.h, z26.h +; CHECK-NEXT: sel z2.d, p5, z10.d, z28.d +; CHECK-NEXT: mov z3.d, p7/m, z10.d +; CHECK-NEXT: fcmuo p5.h, p0/z, z6.h, z6.h +; CHECK-NEXT: fcmuo p7.h, p0/z, z29.h, z29.h +; CHECK-NEXT: sel z5.d, p1, z10.d, z31.d +; CHECK-NEXT: sel z6.d, p10, z10.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr 
p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: fcmuo p0.h, p0/z, z9.h, z9.h +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z7.d, p2, z10.d, z24.d +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: fcmuo p0.h, p0/z, z30.h, z30.h -; CHECK-NEXT: sel z7.d, p7, z7.d, z25.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p7/m, #0 // =0x0 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z6.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv16iXLen.nxv16f16( %x) @@ -319,6 +275,8 @@ define @lrint_v32f16( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p11, [sp] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill @@ -341,8 +299,8 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -353,230 +311,191 @@ define @lrint_v32f16( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: uunpkhi z5.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov w9, #64511 // =0xfbff +; CHECK-NEXT: uunpklo z25.s, z1.h +; CHECK-NEXT: uunpkhi z10.s, z1.h +; CHECK-NEXT: uunpklo z9.s, z2.h ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z6.s, z1.h -; CHECK-NEXT: mov z26.h, w9 -; CHECK-NEXT: uunpkhi z25.s, z1.h +; CHECK-NEXT: uunpkhi z12.s, z3.h +; CHECK-NEXT: mov z27.h, w9 ; CHECK-NEXT: 
mov w9, #31743 // =0x7bff -; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpklo z31.s, z2.h -; CHECK-NEXT: uunpkhi z12.s, z2.h -; CHECK-NEXT: mov z17.d, z3.d -; CHECK-NEXT: uunpklo z0.d, z4.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z7.d, z5.s -; CHECK-NEXT: uunpkhi z24.d, z5.s -; CHECK-NEXT: uunpklo z28.d, z6.s -; CHECK-NEXT: uunpkhi z29.d, z6.s -; CHECK-NEXT: uunpklo z8.d, z25.s -; CHECK-NEXT: uunpkhi z9.d, z25.s -; CHECK-NEXT: uunpklo z16.s, z17.h -; CHECK-NEXT: uunpklo z11.d, z31.s -; CHECK-NEXT: uunpkhi z14.d, z31.s -; CHECK-NEXT: uunpkhi z17.s, z17.h -; CHECK-NEXT: movprfx z30, z4 -; CHECK-NEXT: frintx z30.h, p0/m, z4.h -; CHECK-NEXT: movprfx z4, z7 -; CHECK-NEXT: frintx z4.h, p0/m, z7.h -; CHECK-NEXT: frintx z0.h, p0/m, z0.h -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: frintx z6.h, p0/m, z24.h -; CHECK-NEXT: movprfx z7, z28 -; CHECK-NEXT: frintx z7.h, p0/m, z28.h -; CHECK-NEXT: movprfx z25, z29 -; CHECK-NEXT: frintx z25.h, p0/m, z29.h -; CHECK-NEXT: movprfx z3, z9 -; CHECK-NEXT: frintx z3.h, p0/m, z9.h -; CHECK-NEXT: mov z5.h, w9 -; CHECK-NEXT: movprfx z31, z11 -; CHECK-NEXT: frintx z31.h, p0/m, z11.h -; CHECK-NEXT: movprfx z9, z14 -; CHECK-NEXT: frintx z9.h, p0/m, z14.h -; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z26.h -; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z26.h -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.h -; CHECK-NEXT: fcmge p2.h, p0/z, z30.h, z26.h -; CHECK-NEXT: movprfx z29, z4 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z4.h -; CHECK-NEXT: fcmge p6.h, p0/z, z6.h, z26.h -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.h -; CHECK-NEXT: movprfx z10, z6 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z6.h -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p3.h, p0/z, z7.h, z26.h -; CHECK-NEXT: movprfx z13, z7 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z7.h -; CHECK-NEXT: movprfx z15, z25 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z25.h -; CHECK-NEXT: not p5.b, p0/z, p1.b -; CHECK-NEXT: 
movprfx z18, z3 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z3.h -; CHECK-NEXT: movprfx z20, z31 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z31.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z21, z9 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z9.h -; CHECK-NEXT: fcmgt p1.h, p0/z, z30.h, z5.h -; CHECK-NEXT: sel z0.d, p5, z27.d, z24.d -; CHECK-NEXT: not p7.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p2.h, p0/z, z4.h, z5.h -; CHECK-NEXT: mov z29.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.h, p0/z, z25.h, z26.h -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: fcmge p6.h, p0/z, z9.h, z26.h -; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z5.h -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p7, z27.d, z28.d -; CHECK-NEXT: movprfx z28, z8 -; CHECK-NEXT: frintx z28.h, p0/m, z8.h -; CHECK-NEXT: sel z8.d, p5, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z12.s -; CHECK-NEXT: uunpkhi z12.d, z12.s -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: sel z11.d, p3, z27.d, z13.d -; CHECK-NEXT: uunpklo z13.d, z16.s -; CHECK-NEXT: fcmge p3.h, p0/z, z3.h, z26.h -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: sel z24.d, p5, z27.d, z15.d -; CHECK-NEXT: uunpkhi z15.d, z16.s -; CHECK-NEXT: movprfx z14, z28 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z28.h -; CHECK-NEXT: frintx z10.h, p0/m, z10.h -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: frintx z12.h, p0/m, z12.h -; CHECK-NEXT: uunpkhi z17.d, z17.s -; CHECK-NEXT: movprfx z19, z13 -; CHECK-NEXT: frintx z19.h, p0/m, z13.h -; CHECK-NEXT: fcmge p4.h, p0/z, z28.h, z26.h -; CHECK-NEXT: fcmge p5.h, p0/z, z31.h, z26.h -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: frintx z15.h, p0/m, z15.h -; CHECK-NEXT: fcmge p7.h, p0/z, z10.h, z26.h +; CHECK-NEXT: uunpkhi z14.s, z2.h +; CHECK-NEXT: uunpklo z15.s, z3.h +; CHECK-NEXT: uunpklo z7.d, z0.s +; CHECK-NEXT: uunpklo z5.d, z4.s +; CHECK-NEXT: uunpkhi z6.d, z4.s +; CHECK-NEXT: uunpklo z29.d, z25.s +; 
CHECK-NEXT: uunpkhi z26.d, z0.s +; CHECK-NEXT: uunpklo z8.d, z10.s +; CHECK-NEXT: uunpkhi z11.d, z10.s +; CHECK-NEXT: uunpklo z10.d, z9.s +; CHECK-NEXT: uunpkhi z13.d, z9.s +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: uunpklo z16.d, z12.s +; CHECK-NEXT: uunpklo z18.d, z14.s +; CHECK-NEXT: movprfx z1, z7 +; CHECK-NEXT: frintx z1.h, p0/m, z7.h +; CHECK-NEXT: movprfx z4, z5 +; CHECK-NEXT: frintx z4.h, p0/m, z5.h +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: frintx z5.h, p0/m, z6.h +; CHECK-NEXT: movprfx z7, z29 +; CHECK-NEXT: frintx z7.h, p0/m, z29.h +; CHECK-NEXT: movprfx z6, z26 +; CHECK-NEXT: frintx z6.h, p0/m, z26.h +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z9, z11 +; CHECK-NEXT: frintx z9.h, p0/m, z11.h +; CHECK-NEXT: movprfx z3, z10 +; CHECK-NEXT: frintx z3.h, p0/m, z10.h +; CHECK-NEXT: movprfx z10, z13 +; CHECK-NEXT: frintx z10.h, p0/m, z13.h +; CHECK-NEXT: uunpkhi z26.d, z25.s +; CHECK-NEXT: uunpkhi z13.d, z12.s +; CHECK-NEXT: frintx z8.h, p0/m, z8.h +; CHECK-NEXT: fcmge p3.h, p0/z, z1.h, z27.h +; CHECK-NEXT: uunpkhi z14.d, z14.s +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p2.h, p0/z, z7.h, z27.h +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p4.h, p0/z, z4.h, z27.h +; CHECK-NEXT: fcmge p5.h, p0/z, z5.h, z27.h +; CHECK-NEXT: uunpklo z19.d, z15.s +; CHECK-NEXT: uunpkhi z15.d, z15.s +; CHECK-NEXT: movprfx z20, z13 +; CHECK-NEXT: frintx z20.h, p0/m, z13.h +; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.h, p0/z, z6.h, z27.h +; CHECK-NEXT: frintx z26.h, p0/m, z26.h +; CHECK-NEXT: fcvtzs z29.d, p3/m, z1.h +; CHECK-NEXT: fcmge p3.h, p0/z, z9.h, z27.h +; CHECK-NEXT: mov z11.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z31.d, p2/m, z7.h +; CHECK-NEXT: fcmge p2.h, p0/z, z8.h, z27.h +; CHECK-NEXT: mov z17.d, #0x8000000000000000 ; CHECK-NEXT: frintx z16.h, p0/m, z16.h -; CHECK-NEXT: fcmge p8.h, p0/z, z12.h, z26.h -; CHECK-NEXT: frintx z17.h, p0/m, z17.h -; CHECK-NEXT: 
movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.h -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z13.d, p3, z27.d, z18.d -; CHECK-NEXT: fcmge p3.h, p0/z, z19.h, z26.h -; CHECK-NEXT: movprfx z0, z15 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z15.h -; CHECK-NEXT: sel z22.d, p4, z27.d, z14.d -; CHECK-NEXT: sel z18.d, p6, z27.d, z21.d -; CHECK-NEXT: movprfx z21, z12 -; CHECK-NEXT: fcvtzs z21.d, p0/m, z12.h -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.h -; CHECK-NEXT: sel z14.d, p5, z27.d, z20.d -; CHECK-NEXT: fcmge p4.h, p0/z, z15.h, z26.h -; CHECK-NEXT: movprfx z20, z10 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.h -; CHECK-NEXT: movprfx z2, z17 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z17.h -; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z26.h -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.h, p0/z, z17.h, z26.h -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: mov z21.d, p7/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.h, p0/z, z17.h, z5.h -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.h, p0/z, z16.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z16.h, z16.h -; CHECK-NEXT: mov z29.d, p2/m, z26.d -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: ldr z27, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.h, p0/z, z7.h, z5.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z12.h, z5.h -; CHECK-NEXT: fcmuo p8.h, p0/z, z17.h, z17.h -; CHECK-NEXT: fcmgt p7.h, p0/z, z28.h, z5.h -; CHECK-NEXT: mov z1.d, p4/m, z26.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z15.h, z15.h -; CHECK-NEXT: mov z8.d, p9/m, z26.d -; CHECK-NEXT: mov z27.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z15.h, z5.h -; CHECK-NEXT: mov z2.d, p3/m, z26.d -; CHECK-NEXT: 
fcmgt p3.h, p0/z, z19.h, z5.h -; CHECK-NEXT: mov z11.d, p6/m, z26.d -; CHECK-NEXT: fcmuo p6.h, p0/z, z19.h, z19.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z5.h -; CHECK-NEXT: sel z15.d, p2, z26.d, z21.d -; CHECK-NEXT: fcmuo p2.h, p0/z, z12.h, z12.h -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p7, z26.d, z22.d -; CHECK-NEXT: mov z0.d, p1/m, z26.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z10.h, z5.h -; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z17.d, p3, z26.d, z23.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z10.h, z10.h -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z26.d, z18.d -; CHECK-NEXT: fcmuo p5.h, p0/z, z9.h, z9.h -; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p4.h, p0/z, z3.h, z5.h -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z26.d, z20.d -; CHECK-NEXT: fcmgt p1.h, p0/z, z31.h, z5.h -; CHECK-NEXT: mov z17.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.h, p0/z, z31.h, z31.h +; CHECK-NEXT: frintx z14.h, p0/m, z14.h +; CHECK-NEXT: fcvtzs z0.d, p4/m, z4.h +; CHECK-NEXT: fcvtzs z28.d, p5/m, z5.h +; CHECK-NEXT: fcmge p4.h, p0/z, z3.h, z27.h +; CHECK-NEXT: mov z12.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.h, p0/z, z10.h, z27.h +; CHECK-NEXT: mov z13.d, #0x8000000000000000 +; CHECK-NEXT: frintx z19.h, p0/m, z19.h +; CHECK-NEXT: frintx z15.h, p0/m, z15.h +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z30.d, p1/m, z6.h +; CHECK-NEXT: fcmge p1.h, p0/z, z26.h, z27.h +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: frintx z18.h, p0/m, z18.h +; CHECK-NEXT: fcvtzs z11.d, p3/m, z9.h +; CHECK-NEXT: fcmge p3.h, p0/z, z20.h, z27.h +; CHECK-NEXT: mov z25.h, w9 +; CHECK-NEXT: fcvtzs z17.d, p2/m, z8.h +; CHECK-NEXT: fcmge p6.h, p0/z, z16.h, z27.h +; CHECK-NEXT: mov z21.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p2.h, p0/z, z14.h, z27.h +; CHECK-NEXT: mov z22.d, 
#0x8000000000000000 +; CHECK-NEXT: fcvtzs z12.d, p4/m, z3.h +; CHECK-NEXT: fcvtzs z13.d, p5/m, z10.h +; CHECK-NEXT: fcmge p4.h, p0/z, z19.h, z27.h +; CHECK-NEXT: mov z23.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.h, p0/z, z15.h, z27.h +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z2.d, p1/m, z26.h +; CHECK-NEXT: fcmge p1.h, p0/z, z18.h, z27.h +; CHECK-NEXT: fcvtzs z24.d, p3/m, z20.h +; CHECK-NEXT: mov z27.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p11.h, p0/z, z20.h, z25.h +; CHECK-NEXT: fcvtzs z21.d, p6/m, z16.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z16.h, z25.h +; CHECK-NEXT: fcmuo p6.h, p0/z, z16.h, z16.h +; CHECK-NEXT: fcvtzs z22.d, p2/m, z14.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z8.h, z25.h +; CHECK-NEXT: mov z16.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p7.h, p0/z, z5.h, z25.h +; CHECK-NEXT: fcvtzs z23.d, p4/m, z19.h +; CHECK-NEXT: fcvtzs z0.d, p5/m, z15.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z20.h, z20.h +; CHECK-NEXT: fcmgt p5.h, p0/z, z15.h, z25.h +; CHECK-NEXT: mov z24.d, p11/m, z27.d +; CHECK-NEXT: sel z20.d, p3, z27.d, z21.d +; CHECK-NEXT: fcmgt p3.h, p0/z, z19.h, z25.h +; CHECK-NEXT: fcmgt p8.h, p0/z, z1.h, z25.h +; CHECK-NEXT: mov z17.d, p2/m, z27.d +; CHECK-NEXT: fcvtzs z16.d, p1/m, z18.h +; CHECK-NEXT: fcmgt p2.h, p0/z, z18.h, z25.h +; CHECK-NEXT: mov z28.d, p7/m, z27.d +; CHECK-NEXT: fcmgt p7.h, p0/z, z14.h, z25.h +; CHECK-NEXT: fcmuo p1.h, p0/z, z15.h, z15.h +; CHECK-NEXT: mov z0.d, p5/m, z27.d +; CHECK-NEXT: mov z24.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p5.h, p0/z, z10.h, z25.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z19.h, z19.h +; CHECK-NEXT: sel z19.d, p3, z27.d, z23.d +; CHECK-NEXT: fcmuo p3.h, p0/z, z14.h, z14.h +; CHECK-NEXT: mov z20.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p9.h, p0/z, z6.h, z25.h +; CHECK-NEXT: fcmgt p10.h, p0/z, z7.h, z25.h +; CHECK-NEXT: str z24, [x8, #15, mul vl] +; CHECK-NEXT: sel z24.d, p2, z27.d, z16.d +; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z25.h +; CHECK-NEXT: sel z15.d, p7, z27.d, z22.d +; CHECK-NEXT: 
mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z13.d, p5/m, z27.d +; CHECK-NEXT: str z20, [x8, #14, mul vl] +; CHECK-NEXT: fcmgt p5.h, p0/z, z9.h, z25.h +; CHECK-NEXT: fcmuo p1.h, p0/z, z18.h, z18.h +; CHECK-NEXT: mov z19.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z10.h, z10.h +; CHECK-NEXT: mov z29.d, p8/m, z27.d ; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z25.h, z25.h -; CHECK-NEXT: str z17, [x8, #12, mul vl] -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.h, p0/z, z25.h, z5.h +; CHECK-NEXT: mov z15.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p3.h, p0/z, z3.h, z3.h +; CHECK-NEXT: sel z0.d, p2, z27.d, z12.d +; CHECK-NEXT: fcmuo p2.h, p0/z, z9.h, z9.h +; CHECK-NEXT: mov z30.d, p9/m, z27.d +; CHECK-NEXT: str z19, [x8, #12, mul vl] +; CHECK-NEXT: sel z3.d, p5, z27.d, z11.d +; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 ; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: sel z0.d, p1, z26.d, z14.d -; CHECK-NEXT: fcmuo p1.h, p0/z, z3.h, z3.h -; CHECK-NEXT: sel z3.d, p4, z26.d, z13.d -; CHECK-NEXT: fcmuo p4.h, p0/z, z28.h, z28.h -; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: sel z1.d, p3, z26.d, z24.d -; CHECK-NEXT: fcmuo p3.h, p0/z, z7.h, z7.h -; CHECK-NEXT: ldr z7, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z2, [x8, #9, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.h, p0/z, z6.h, z6.h -; CHECK-NEXT: mov z16.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.h, p0/z, z4.h, z4.h -; CHECK-NEXT: fcmgt p2.h, p0/z, z7.h, z5.h -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.h, p0/z, z30.h, z30.h +; CHECK-NEXT: fcmgt p1.h, p0/z, z26.h, z25.h +; CHECK-NEXT: mov z13.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z8.h, z8.h +; CHECK-NEXT: fcmuo p4.h, p0/z, z26.h, z26.h +; CHECK-NEXT: str z24, [x8, #10, mul vl] +; CHECK-NEXT: mov 
z3.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.h, p0/z, z4.h, z25.h +; CHECK-NEXT: str z13, [x8, #9, mul vl] +; CHECK-NEXT: fcmuo p2.h, p0/z, z6.h, z6.h +; CHECK-NEXT: mov z31.d, p10/m, z27.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p0.h, p0/z, z7.h, z7.h -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z2.d, p1/m, z27.d +; CHECK-NEXT: fcmuo p1.h, p0/z, z7.h, z7.h ; CHECK-NEXT: str z3, [x8, #7, mul vl] -; CHECK-NEXT: ldr z0, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z16, [x8, #6, mul vl] -; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z26.d -; CHECK-NEXT: str z29, [x8, #2, mul vl] -; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: mov z17.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.h, p0/z, z1.h, z1.h +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.h, p0/z, z5.h, z5.h +; CHECK-NEXT: fcmuo p0.h, p0/z, z4.h, z4.h +; CHECK-NEXT: str z17, [x8, #6, mul vl] +; CHECK-NEXT: mov z31.d, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z30.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, z27.d +; CHECK-NEXT: mov z29.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z2, [x8, #5, mul vl] +; CHECK-NEXT: str z31, [x8, #4, mul vl] +; CHECK-NEXT: mov z28.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z30, [x8, #3, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z29, [x8, #2, mul vl] +; CHECK-NEXT: str z28, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -593,6 +512,8 @@ define 
@lrint_v32f16( %x) { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload @@ -612,20 +533,17 @@ define @lrint_v1f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv1iXLen.nxv1f32( %x) @@ -638,20 +556,17 @@ define @lrint_v2f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: frintx z0.s, p0/m, z0.s -; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: 
fcvtzs z1.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z3.s -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv2iXLen.nxv2f32( %x) @@ -662,43 +577,30 @@ declare @llvm.lrint.nxv2iXLen.nxv2f32() define @lrint_v4f32( %x) { ; CHECK-LABEL: lrint_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: mov z5.s, w8 ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; 
CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmuo p3.s, p0/z, z1.s, z1.s +; CHECK-NEXT: fcvtzs z3.d, p1/m, z1.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z5.s +; CHECK-NEXT: fcvtzs z4.d, p2/m, z0.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z5.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv4iXLen.nxv4f32( %x) ret %a @@ -710,7 +612,6 @@ define @lrint_v8f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -719,57 +620,47 @@ define @lrint_v8f32( %x) { ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 -; CHECK-NEXT: uunpklo z3.d, z1.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z3.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff ; CHECK-NEXT: mov z5.d, #0x8000000000000000 -; CHECK-NEXT: mov z6.s, w8 -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z6.d, #0x8000000000000000 +; CHECK-NEXT: mov z25.s, w8 +; CHECK-NEXT: mov z7.d, #0x8000000000000000 ; CHECK-NEXT: frintx z2.s, p0/m, z2.s ; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: mov z24.d, 
#0x8000000000000000 ; CHECK-NEXT: frintx z3.s, p0/m, z3.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, z4.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z4.s -; CHECK-NEXT: movprfx z7, z0 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmge p3.s, p0/z, z3.s, z4.s ; CHECK-NEXT: fcmge p4.s, p0/z, z1.s, z4.s -; CHECK-NEXT: movprfx z4, z2 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z2.s -; CHECK-NEXT: movprfx z24, z3 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.s -; CHECK-NEXT: movprfx z25, z1 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z1.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z3.s, z6.s -; CHECK-NEXT: fcmgt p5.s, p0/z, z2.s, z6.s -; CHECK-NEXT: fcmgt p6.s, p0/z, z0.s, z6.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z6.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z2.s, z2.s -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d -; CHECK-NEXT: fcmuo p4.s, p0/z, z3.s, z3.s +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p5.s, p0/z, z1.s, z25.s +; CHECK-NEXT: fcvtzs z5.d, p1/m, z2.s +; CHECK-NEXT: fcvtzs z6.d, p2/m, z0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z2.s, z25.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z0.s, z25.s +; CHECK-NEXT: fcvtzs z7.d, p3/m, z3.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z3.s, z25.s +; CHECK-NEXT: fcvtzs z24.d, p4/m, z1.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z2.s, z2.s +; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d +; CHECK-NEXT: fcmuo p1.s, p0/z, z3.s, z3.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded 
Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -783,7 +674,7 @@ define @lrint_v16f32( %x) { ; CHECK-LABEL: lrint_v16f32: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -791,119 +682,106 @@ define @lrint_v16f32( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 
0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ; CHECK-NEXT: uunpklo z4.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: uunpklo z24.d, z2.s +; CHECK-NEXT: uunpklo z6.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpklo z25.d, z3.s -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff -; CHECK-NEXT: movprfx z5, z4 -; CHECK-NEXT: frintx z5.s, p0/m, z4.s -; CHECK-NEXT: movprfx z6, z0 -; CHECK-NEXT: frintx z6.s, p0/m, z0.s -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: frintx z7.s, p0/m, z7.s -; CHECK-NEXT: movprfx z28, z1 -; CHECK-NEXT: frintx z28.s, p0/m, z1.s +; CHECK-NEXT: uunpklo z5.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z7.d, z3.s +; CHECK-NEXT: mov z24.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z0.d, #0x8000000000000000 -; CHECK-NEXT: frintx z24.s, p0/m, z24.s -; CHECK-NEXT: movprfx z29, z2 -; CHECK-NEXT: frintx z29.s, p0/m, z2.s -; CHECK-NEXT: frintx z25.s, p0/m, z25.s -; CHECK-NEXT: movprfx z30, z3 -; CHECK-NEXT: frintx z30.s, p0/m, z3.s -; CHECK-NEXT: mov z27.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z5.s, z4.s -; CHECK-NEXT: fcmge p2.s, p0/z, z6.s, z4.s -; CHECK-NEXT: movprfx z1, z5 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z5.s -; CHECK-NEXT: movprfx z2, z6 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z6.s -; CHECK-NEXT: fcmge p5.s, p0/z, z7.s, z4.s -; CHECK-NEXT: fcmge p6.s, p0/z, z28.s, z4.s -; CHECK-NEXT: movprfx z3, z7 -; CHECK-NEXT: fcvtzs z3.d, p0/m, z7.s -; CHECK-NEXT: fcmge p8.s, p0/z, z29.s, z4.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z5.s, z27.s -; 
CHECK-NEXT: fcmgt p7.s, p0/z, z6.s, z27.s -; CHECK-NEXT: fcmge p9.s, p0/z, z25.s, z4.s -; CHECK-NEXT: movprfx z31, z25 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z25.s -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z28.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmge p10.s, p0/z, z30.s, z4.s -; CHECK-NEXT: movprfx z8, z30 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z30.s -; CHECK-NEXT: mov z1.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.s, p0/z, z24.s, z4.s -; CHECK-NEXT: movprfx z4, z29 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z29.s -; CHECK-NEXT: mov z2.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z6.s, z6.s -; CHECK-NEXT: movprfx z6, z24 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z24.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z3.d, p5/m, z0.d -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: mov z5.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p8.s, p0/z, z7.s, z27.s -; CHECK-NEXT: not p6.b, p0/z, p9.b -; CHECK-NEXT: mov z6.d, p4/m, z0.d -; CHECK-NEXT: fcmuo p9.s, p0/z, z7.s, z7.s -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.s, p0/z, z28.s, z27.s -; CHECK-NEXT: sel z7.d, p5, z0.d, z4.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z24.s, z27.s -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z30.s, z27.s -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z26.d, z1.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z29.s, z27.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z25.s, z27.s -; CHECK-NEXT: sel z1.d, p7, z26.d, z2.d -; CHECK-NEXT: fcmuo p7.s, p0/z, z28.s, z28.s -; CHECK-NEXT: sel z2.d, p8, z26.d, z3.d -; CHECK-NEXT: sel z3.d, p10, z26.d, z5.d -; CHECK-NEXT: fcmuo p8.s, p0/z, z29.s, z29.s -; CHECK-NEXT: sel z4.d, p5, z26.d, z6.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z24.s, z24.s -; CHECK-NEXT: fcmuo p10.s, p0/z, z25.s, z25.s -; CHECK-NEXT: sel z5.d, p3, z26.d, z7.d -; CHECK-NEXT: fcmuo p0.s, p0/z, z30.s, 
z30.s -; CHECK-NEXT: sel z7.d, p6, z26.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: frintx z4.s, p0/m, z4.s +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: mov z30.s, w8 +; CHECK-NEXT: movprfx z27, z2 +; CHECK-NEXT: frintx z27.s, p0/m, z2.s +; CHECK-NEXT: uunpkhi z2.d, z3.s +; CHECK-NEXT: frintx z6.s, p0/m, z6.s +; CHECK-NEXT: movprfx z25, z1 +; CHECK-NEXT: frintx z25.s, p0/m, z1.s +; CHECK-NEXT: frintx z5.s, p0/m, z5.s +; CHECK-NEXT: frintx z7.s, p0/m, z7.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.s, p0/z, z4.s, z24.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z24.s +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z9, z2 +; CHECK-NEXT: frintx z9.s, p0/m, z2.s +; CHECK-NEXT: fcmge p5.s, p0/z, z6.s, z24.s +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p3.s, p0/z, z5.s, z24.s +; CHECK-NEXT: fcmge p4.s, p0/z, z25.s, z24.s +; CHECK-NEXT: fcmge p7.s, p0/z, z7.s, z24.s +; CHECK-NEXT: fcmge p6.s, p0/z, z27.s, z24.s +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: mov z10.d, #0x7fffffffffffffff +; CHECK-NEXT: fcvtzs z1.d, p1/m, z4.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z4.s, z30.s +; CHECK-NEXT: fcvtzs z26.d, p2/m, z0.s +; CHECK-NEXT: fcmge p2.s, p0/z, z9.s, z24.s +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z29.d, p5/m, z6.s +; CHECK-NEXT: fcvtzs z3.d, p3/m, z5.s +; CHECK-NEXT: fcvtzs z28.d, p4/m, z25.s +; CHECK-NEXT: fcvtzs z8.d, p7/m, z7.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z30.s +; CHECK-NEXT: fcmgt p5.s, p0/z, z5.s, z30.s +; CHECK-NEXT: fcmgt p7.s, p0/z, z25.s, z30.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z6.s, z30.s +; CHECK-NEXT: fcvtzs z31.d, p6/m, z27.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z0.s, z0.s +; CHECK-NEXT: sel z0.d, p1, z10.d, z1.d +; CHECK-NEXT: fcmgt p1.s, p0/z, z27.s, z30.s +; 
CHECK-NEXT: fcmgt p10.s, p0/z, z7.s, z30.s +; CHECK-NEXT: fcvtzs z24.d, p2/m, z9.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z9.s, z30.s +; CHECK-NEXT: fcmuo p3.s, p0/z, z4.s, z4.s +; CHECK-NEXT: fcmuo p9.s, p0/z, z5.s, z5.s +; CHECK-NEXT: sel z1.d, p4, z10.d, z26.d +; CHECK-NEXT: fcmuo p4.s, p0/z, z25.s, z25.s +; CHECK-NEXT: sel z2.d, p5, z10.d, z3.d +; CHECK-NEXT: sel z3.d, p7, z10.d, z28.d +; CHECK-NEXT: sel z4.d, p8, z10.d, z29.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z6.s, z6.s +; CHECK-NEXT: fcmuo p7.s, p0/z, z27.s, z27.s +; CHECK-NEXT: fcmuo p8.s, p0/z, z7.s, z7.s +; CHECK-NEXT: sel z5.d, p1, z10.d, z31.d +; CHECK-NEXT: sel z6.d, p10, z10.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: fcmuo p0.s, p0/z, z9.s, z9.s +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z7.d, p2, z10.d, z24.d +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z6.d, p4, z26.d, z31.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p7/m, #0 // =0x0 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z6.d, p8/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 -; CHECK-NEXT: ldr p7, [sp, #4, mul 
vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv16iXLen.nxv16f32( %x) @@ -916,6 +794,8 @@ define @lrint_v32f32( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p11, [sp] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill @@ -938,8 +818,8 @@ define @lrint_v32f32( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -950,224 +830,185 @@ define @lrint_v32f32( %x) { ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; 
CHECK-NEXT: uunpklo z24.d, z0.s -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z25.d, z0.s ; CHECK-NEXT: mov w9, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z26.d, z1.s -; CHECK-NEXT: uunpkhi z25.d, z0.s -; CHECK-NEXT: uunpkhi z28.d, z1.s -; CHECK-NEXT: mov z29.s, w9 +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z27.d, z2.s +; CHECK-NEXT: uunpkhi z9.d, z2.s +; CHECK-NEXT: uunpklo z11.d, z3.s +; CHECK-NEXT: uunpkhi z12.d, z3.s +; CHECK-NEXT: mov z10.s, w9 ; CHECK-NEXT: mov w9, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: mov z27.d, #0x8000000000000000 -; CHECK-NEXT: uunpkhi z30.d, z2.s -; CHECK-NEXT: uunpklo z8.d, z3.s ; CHECK-NEXT: movprfx z0, z24 ; CHECK-NEXT: frintx z0.s, p0/m, z24.s -; CHECK-NEXT: uunpkhi z9.d, z3.s +; CHECK-NEXT: movprfx z24, z25 +; CHECK-NEXT: frintx z24.s, p0/m, z25.s +; CHECK-NEXT: uunpklo z13.d, z4.s +; CHECK-NEXT: movprfx z25, z26 +; CHECK-NEXT: frintx z25.s, p0/m, z26.s +; CHECK-NEXT: movprfx z26, z1 +; CHECK-NEXT: frintx z26.s, p0/m, z1.s ; CHECK-NEXT: uunpkhi z14.d, z4.s -; CHECK-NEXT: movprfx z24, z26 -; CHECK-NEXT: frintx z24.s, p0/m, z26.s -; CHECK-NEXT: movprfx z1, z25 -; CHECK-NEXT: frintx z1.s, p0/m, z25.s -; CHECK-NEXT: movprfx z5, z28 -; CHECK-NEXT: frintx z5.s, p0/m, z28.s -; CHECK-NEXT: uunpklo z26.d, z2.s -; CHECK-NEXT: uunpklo z16.d, z17.s -; CHECK-NEXT: mov z25.s, w9 -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: frintx z28.s, p0/m, z30.s -; CHECK-NEXT: movprfx z30, z8 -; CHECK-NEXT: frintx z30.s, p0/m, z8.s -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z29.s -; CHECK-NEXT: movprfx z31, z0 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z0.s -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z24.s, z29.s -; CHECK-NEXT: fcmge p5.s, p0/z, z5.s, z29.s -; CHECK-NEXT: frintx z26.s, p0/m, z26.s -; CHECK-NEXT: movprfx z10, z1 -; CHECK-NEXT: fcvtzs z10.d, p0/m, z1.s -; CHECK-NEXT: movprfx 
z11, z24 -; CHECK-NEXT: fcvtzs z11.d, p0/m, z24.s -; CHECK-NEXT: movprfx z12, z5 -; CHECK-NEXT: fcvtzs z12.d, p0/m, z5.s -; CHECK-NEXT: movprfx z15, z28 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z28.s -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p1.s, p0/z, z1.s, z25.s -; CHECK-NEXT: fcmgt p9.s, p0/z, z5.s, z25.s -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z0.d, p4, z27.d, z31.d -; CHECK-NEXT: fcmge p4.s, p0/z, z26.s, z29.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: movprfx z13, z26 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z26.s -; CHECK-NEXT: sel z31.d, p2, z27.d, z10.d -; CHECK-NEXT: uunpklo z10.d, z4.s -; CHECK-NEXT: sel z8.d, p3, z27.d, z11.d -; CHECK-NEXT: fcmge p3.s, p0/z, z28.s, z29.s -; CHECK-NEXT: sel z11.d, p5, z27.d, z12.d -; CHECK-NEXT: movprfx z4, z9 -; CHECK-NEXT: frintx z4.s, p0/m, z9.s -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: not p5.b, p0/z, p4.b -; CHECK-NEXT: fcmge p4.s, p0/z, z30.s, z29.s -; CHECK-NEXT: fcmgt p2.s, p0/z, z24.s, z25.s -; CHECK-NEXT: sel z12.d, p5, z27.d, z13.d -; CHECK-NEXT: uunpkhi z13.d, z17.s -; CHECK-NEXT: movprfx z9, z10 -; CHECK-NEXT: frintx z9.s, p0/m, z10.s -; CHECK-NEXT: movprfx z10, z14 -; CHECK-NEXT: frintx z10.s, p0/m, z14.s -; CHECK-NEXT: uunpkhi z17.d, z6.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: uunpklo z14.d, z6.s -; CHECK-NEXT: movprfx z6, z16 -; CHECK-NEXT: frintx z6.s, p0/m, z16.s -; CHECK-NEXT: uunpklo z16.d, z7.s +; CHECK-NEXT: movprfx z2, z27 +; CHECK-NEXT: frintx z2.s, p0/m, z27.s +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z27, z9 +; CHECK-NEXT: frintx z27.s, p0/m, z9.s +; CHECK-NEXT: movprfx z9, z11 +; CHECK-NEXT: frintx z9.s, p0/m, z11.s +; CHECK-NEXT: movprfx z11, z12 +; CHECK-NEXT: frintx z11.s, p0/m, z12.s +; CHECK-NEXT: uunpklo z15.d, z7.s ; CHECK-NEXT: uunpkhi z7.d, z7.s -; CHECK-NEXT: sel z3.d, p3, z27.d, z15.d -; 
CHECK-NEXT: fcmge p3.s, p0/z, z4.s, z29.s -; CHECK-NEXT: frintx z13.s, p0/m, z13.s -; CHECK-NEXT: movprfx z15, z30 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z30.s -; CHECK-NEXT: fcmge p5.s, p0/z, z9.s, z29.s -; CHECK-NEXT: fcmge p6.s, p0/z, z10.s, z29.s -; CHECK-NEXT: frintx z17.s, p0/m, z17.s -; CHECK-NEXT: movprfx z18, z4 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z4.s -; CHECK-NEXT: movprfx z20, z10 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z10.s -; CHECK-NEXT: frintx z16.s, p0/m, z16.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z19, z14 -; CHECK-NEXT: frintx z19.s, p0/m, z14.s -; CHECK-NEXT: movprfx z14, z9 -; CHECK-NEXT: fcvtzs z14.d, p0/m, z9.s -; CHECK-NEXT: fcmge p7.s, p0/z, z6.s, z29.s -; CHECK-NEXT: fcmge p8.s, p0/z, z13.s, z29.s +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p3.s, p0/z, z26.s, z10.s +; CHECK-NEXT: mov z30.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.s, p0/z, z0.s, z10.s +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.s, p0/z, z24.s, z10.s +; CHECK-NEXT: movprfx z12, z13 +; CHECK-NEXT: frintx z12.s, p0/m, z13.s +; CHECK-NEXT: fcmge p2.s, p0/z, z25.s, z10.s +; CHECK-NEXT: fcmge p4.s, p0/z, z2.s, z10.s +; CHECK-NEXT: movprfx z13, z14 +; CHECK-NEXT: frintx z13.s, p0/m, z14.s +; CHECK-NEXT: uunpklo z17.d, z5.s +; CHECK-NEXT: uunpkhi z18.d, z5.s ; CHECK-NEXT: movprfx z21, z7 ; CHECK-NEXT: frintx z21.s, p0/m, z7.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: mov z15.d, p4/m, z27.d -; CHECK-NEXT: fcmge p4.s, p0/z, z17.s, z29.s -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z7.d, p3, z27.d, z18.d -; CHECK-NEXT: movprfx z0, z17 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z17.s -; CHECK-NEXT: sel z18.d, p6, z27.d, z20.d -; CHECK-NEXT: movprfx z20, z6 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z6.s -; CHECK-NEXT: fcmge p6.s, p0/z, z16.s, z29.s -; CHECK-NEXT: fcmge p3.s, p0/z, z19.s, z29.s -; CHECK-NEXT: mov z14.d, p5/m, z27.d -; 
CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.s, p0/z, z21.s, z29.s -; CHECK-NEXT: movprfx z1, z16 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z16.s -; CHECK-NEXT: movprfx z22, z13 -; CHECK-NEXT: fcvtzs z22.d, p0/m, z13.s -; CHECK-NEXT: movprfx z23, z19 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z19.s -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z2, z21 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z21.s -; CHECK-NEXT: mov z29.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z20.d, p5/m, z27.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z0.d, p4/m, z27.d -; CHECK-NEXT: fcmgt p4.s, p0/z, z16.s, z25.s -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z1.d, p5/m, z27.d -; CHECK-NEXT: mov z22.d, p7/m, z27.d -; CHECK-NEXT: mov z23.d, p3/m, z27.d -; CHECK-NEXT: fcmgt p3.s, p0/z, z21.s, z25.s -; CHECK-NEXT: fcmuo p5.s, p0/z, z16.s, z16.s -; CHECK-NEXT: mov z2.d, p6/m, z27.d -; CHECK-NEXT: sel z27.d, p1, z29.d, z31.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z17.s, z25.s -; CHECK-NEXT: mov z1.d, p4/m, z29.d -; CHECK-NEXT: fcmgt p6.s, p0/z, z26.s, z25.s -; CHECK-NEXT: fcmgt p7.s, p0/z, z30.s, z25.s -; CHECK-NEXT: sel z31.d, p2, z29.d, z8.d -; CHECK-NEXT: fcmgt p2.s, p0/z, z13.s, z25.s -; CHECK-NEXT: fcmuo p8.s, p0/z, z21.s, z21.s -; CHECK-NEXT: mov z2.d, p3/m, z29.d -; CHECK-NEXT: fcmuo p4.s, p0/z, z17.s, z17.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z25.s -; CHECK-NEXT: mov z0.d, p1/m, z29.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z6.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: sel z8.d, p9, z29.d, z11.d -; CHECK-NEXT: sel z11.d, p6, z29.d, z12.d -; CHECK-NEXT: sel z12.d, p7, z29.d, z15.d -; CHECK-NEXT: fcmgt p5.s, p0/z, z10.s, z25.s -; CHECK-NEXT: sel z15.d, p2, z29.d, z22.d -; CHECK-NEXT: fcmuo p2.s, p0/z, z13.s, z13.s -; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 +; CHECK-NEXT: uunpklo z19.d, z6.s +; CHECK-NEXT: uunpkhi z20.d, z6.s +; CHECK-NEXT: str z0, 
[sp] // 16-byte Folded Spill +; CHECK-NEXT: fcvtzs z31.d, p3/m, z26.s +; CHECK-NEXT: fcmge p3.s, p0/z, z11.s, z10.s +; CHECK-NEXT: mov z5.d, #0x8000000000000000 +; CHECK-NEXT: frintx z15.s, p0/m, z15.s +; CHECK-NEXT: fcvtzs z1.d, p5/m, z0.s +; CHECK-NEXT: fcvtzs z29.d, p1/m, z24.s +; CHECK-NEXT: fcvtzs z30.d, p2/m, z25.s +; CHECK-NEXT: fcvtzs z8.d, p4/m, z2.s +; CHECK-NEXT: fcmge p1.s, p0/z, z27.s, z10.s +; CHECK-NEXT: mov z4.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p2.s, p0/z, z9.s, z10.s +; CHECK-NEXT: mov z16.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p4.s, p0/z, z12.s, z10.s +; CHECK-NEXT: mov z6.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.s, p0/z, z13.s, z10.s +; CHECK-NEXT: mov z14.d, #0x8000000000000000 +; CHECK-NEXT: frintx z17.s, p0/m, z17.s +; CHECK-NEXT: frintx z18.s, p0/m, z18.s +; CHECK-NEXT: frintx z19.s, p0/m, z19.s +; CHECK-NEXT: frintx z20.s, p0/m, z20.s +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z5.d, p3/m, z11.s +; CHECK-NEXT: fcmge p3.s, p0/z, z21.s, z10.s +; CHECK-NEXT: mov z3.s, w9 +; CHECK-NEXT: fcmge p6.s, p0/z, z15.s, z10.s +; CHECK-NEXT: mov z22.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z4.d, p1/m, z27.s +; CHECK-NEXT: fcvtzs z16.d, p2/m, z9.s +; CHECK-NEXT: fcvtzs z6.d, p4/m, z12.s +; CHECK-NEXT: fcvtzs z14.d, p5/m, z13.s +; CHECK-NEXT: fcmge p1.s, p0/z, z17.s, z10.s +; CHECK-NEXT: fcmge p2.s, p0/z, z18.s, z10.s +; CHECK-NEXT: mov z23.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p4.s, p0/z, z19.s, z10.s +; CHECK-NEXT: fcmge p5.s, p0/z, z20.s, z10.s +; CHECK-NEXT: mov z10.d, #0x8000000000000000 +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z28.d, p3/m, z21.s +; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p11.s, p0/z, z21.s, z3.s +; CHECK-NEXT: fcvtzs z22.d, p6/m, z15.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z15.s, z3.s +; CHECK-NEXT: fcmuo p6.s, p0/z, z15.s, z15.s +; CHECK-NEXT: mov z15.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p7.s, p0/z, z24.s, z3.s +; CHECK-NEXT: 
fcvtzs z23.d, p2/m, z18.s +; CHECK-NEXT: fcvtzs z10.d, p5/m, z20.s +; CHECK-NEXT: fcmgt p2.s, p0/z, z9.s, z3.s +; CHECK-NEXT: fcmgt p5.s, p0/z, z20.s, z3.s +; CHECK-NEXT: fcvtzs z0.d, p4/m, z19.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z21.s, z21.s +; CHECK-NEXT: mov z28.d, p11/m, z7.d +; CHECK-NEXT: sel z21.d, p3, z7.d, z22.d +; CHECK-NEXT: fcmgt p3.s, p0/z, z19.s, z3.s +; CHECK-NEXT: fcvtzs z15.d, p1/m, z17.s +; CHECK-NEXT: fcmuo p1.s, p0/z, z20.s, z20.s +; CHECK-NEXT: mov z29.d, p7/m, z7.d +; CHECK-NEXT: fcmgt p7.s, p0/z, z18.s, z3.s +; CHECK-NEXT: mov z16.d, p2/m, z7.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z17.s, z3.s +; CHECK-NEXT: mov z10.d, p5/m, z7.d +; CHECK-NEXT: mov z28.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z19.s, z19.s +; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: fcmuo p3.s, p0/z, z18.s, z18.s +; CHECK-NEXT: fcmgt p5.s, p0/z, z13.s, z3.s +; CHECK-NEXT: mov z21.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p10.s, p0/z, z2.s, z3.s +; CHECK-NEXT: fcmgt p8.s, p0/z, z25.s, z3.s +; CHECK-NEXT: str z28, [x8, #15, mul vl] +; CHECK-NEXT: mov z10.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.s, p0/z, z17.s, z17.s +; CHECK-NEXT: sel z19.d, p7, z7.d, z23.d +; CHECK-NEXT: sel z28.d, p2, z7.d, z15.d +; CHECK-NEXT: fcmgt p2.s, p0/z, z12.s, z3.s +; CHECK-NEXT: str z21, [x8, #14, mul vl] ; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p1, z29.d, z20.d -; CHECK-NEXT: fcmgt p1.s, p0/z, z9.s, z25.s -; CHECK-NEXT: fcmuo p6.s, p0/z, z19.s, z19.s -; CHECK-NEXT: sel z16.d, p3, z29.d, z23.d -; CHECK-NEXT: fcmuo p3.s, p0/z, z6.s, z6.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z4.s, z25.s -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z29.d, z18.d -; CHECK-NEXT: fcmuo p5.s, p0/z, z10.s, z10.s -; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.s, p0/z, z9.s, z9.s -; CHECK-NEXT: sel z0.d, p1, z29.d, z14.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z4.s, z4.s 
-; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.s, p0/z, z28.s, z25.s -; CHECK-NEXT: sel z4.d, p4, z29.d, z7.d -; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z28.s, z28.s -; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.s, p0/z, z30.s, z30.s -; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: mov z4.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p1.s, p0/z, z5.s, z5.s -; CHECK-NEXT: sel z1.d, p3, z29.d, z3.d -; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z2, [x8, #9, mul vl] +; CHECK-NEXT: mov z14.d, p5/m, z7.d +; CHECK-NEXT: str z10, [x8, #13, mul vl] +; CHECK-NEXT: fcmgt p5.s, p0/z, z11.s, z3.s +; CHECK-NEXT: fcmuo p4.s, p0/z, z13.s, z13.s +; CHECK-NEXT: mov z19.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z28.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p1.s, p0/z, z27.s, z3.s +; CHECK-NEXT: str z0, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p3.s, p0/z, z12.s, z12.s +; CHECK-NEXT: sel z0.d, p2, z7.d, z6.d +; CHECK-NEXT: fcmuo p2.s, p0/z, z11.s, z11.s +; CHECK-NEXT: fcmgt p9.s, p0/z, z26.s, z3.s +; CHECK-NEXT: mov z30.d, p8/m, z7.d +; CHECK-NEXT: str z19, [x8, #11, mul vl] +; CHECK-NEXT: mov z5.d, p5/m, z7.d +; CHECK-NEXT: fcmuo p5.s, p0/z, z9.s, z9.s +; CHECK-NEXT: str z28, [x8, #10, mul vl] +; CHECK-NEXT: mov z4.d, p1/m, z7.d +; CHECK-NEXT: fcmuo p1.s, p0/z, z2.s, z2.s +; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov z14.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.s, p0/z, z27.s, z27.s +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z5.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.s, p0/z, z26.s, z26.s +; CHECK-NEXT: mov z16.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.s, p0/z, z25.s, z25.s +; CHECK-NEXT: mov z31.d, p9/m, z7.d +; CHECK-NEXT: str z14, [x8, #9, mul vl] +; CHECK-NEXT: fcmgt p3.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z8.d, p10/m, z7.d ; CHECK-NEXT: str 
z0, [x8, #8, mul vl] -; CHECK-NEXT: fcmuo p3.s, p0/z, z26.s, z26.s -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: str z4, [x8, #7, mul vl] -; CHECK-NEXT: mov z12.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p2.s, p0/z, z3.s, z25.s -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z4.d, p4/m, #0 // =0x0 ; CHECK-NEXT: fcmuo p4.s, p0/z, z24.s, z24.s +; CHECK-NEXT: str z5, [x8, #7, mul vl] +; CHECK-NEXT: fcmuo p0.s, p0/z, z2.s, z2.s +; CHECK-NEXT: mov z31.d, p2/m, #0 // =0x0 +; CHECK-NEXT: str z16, [x8, #6, mul vl] ; CHECK-NEXT: mov z8.d, p1/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.s, p0/z, z0.s, z0.s -; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str z12, [x8, #6, mul vl] -; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: fcmuo p0.s, p0/z, z3.s, z3.s -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: str z8, [x8, #3, mul vl] -; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 -; CHECK-NEXT: mov z0.d, p2/m, z29.d -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: mov z27.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z31, [x8, #2, mul vl] +; CHECK-NEXT: mov z30.d, p5/m, #0 // =0x0 +; CHECK-NEXT: str z4, [x8, #5, mul vl] +; CHECK-NEXT: sel z0.d, p3, z7.d, z1.d +; CHECK-NEXT: str z31, [x8, #3, mul vl] +; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z8, [x8, #4, mul vl] +; CHECK-NEXT: str z30, [x8, #2, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 -; CHECK-NEXT: str z27, [x8, #1, mul vl] +; CHECK-NEXT: str z29, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1184,6 +1025,8 @@ define @lrint_v32f32( %x) { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: 
ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload @@ -1203,20 +1046,17 @@ define @lrint_v1f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d -; CHECK-NEXT: mov z3.d, #0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv1iXLen.nxv1f64( %x) @@ -1229,20 +1069,17 @@ define @lrint_v2f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z0.d, z3.d -; CHECK-NEXT: mov z3.d, 
#0x7fffffffffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z1.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: mov z1.d, p1/m, z2.d -; CHECK-NEXT: sel z0.d, p2, z3.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z2.d, z1.d ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv2iXLen.nxv2f64( %x) @@ -1253,41 +1090,28 @@ declare @llvm.lrint.nxv2iXLen.nxv2f64( define @lrint_v4f64( %x) { ; CHECK-LABEL: lrint_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff +; CHECK-NEXT: mov z3.d, #0x8000000000000000 ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d -; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0x8000000000000000 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d -; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, 
z0.d +; CHECK-NEXT: fcvtzs z3.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d +; CHECK-NEXT: fcvtzs z4.d, p2/m, z1.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z1.d, z5.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: sel z0.d, p1, z2.d, z3.d +; CHECK-NEXT: sel z1.d, p2, z2.d, z4.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv4iXLen.nxv4f64( %x) ret %a @@ -1299,7 +1123,6 @@ define @lrint_v8f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -1309,52 +1132,42 @@ define @lrint_v8f64( %x) { ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 ; CHECK-NEXT: mov z5.d, #0x8000000000000000 ; CHECK-NEXT: mov z4.d, x8 +; CHECK-NEXT: mov z6.d, #0x8000000000000000 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z26.d, #0x7fffffffffffffff ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: frintx z2.d, p0/m, z2.d ; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: mov z6.d, x8 +; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: mov z7.d, #0x8000000000000000 +; CHECK-NEXT: mov z24.d, #0x8000000000000000 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z4.d ; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmge p3.d, p0/z, z2.d, z4.d ; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z4.d -; 
CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d -; CHECK-NEXT: movprfx z24, z2 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z2.d -; CHECK-NEXT: movprfx z25, z3 -; CHECK-NEXT: fcvtzs z25.d, p0/m, z3.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z2.d, z6.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z0.d, z6.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z1.d, z6.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z4.d, p1/m, z5.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z6.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z2.d, z2.d +; CHECK-NEXT: mov z4.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z25.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z1.d, z1.d +; CHECK-NEXT: fcvtzs z5.d, p1/m, z0.d +; CHECK-NEXT: fcvtzs z6.d, p2/m, z1.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z25.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z1.d, z25.d +; CHECK-NEXT: fcvtzs z7.d, p3/m, z2.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z2.d, z25.d +; CHECK-NEXT: fcvtzs z24.d, p4/m, z3.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z0.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z4.d, z5.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z2.d, z2.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d +; CHECK-NEXT: sel z1.d, p2, z4.d, z6.d +; CHECK-NEXT: sel z2.d, p3, z4.d, z7.d +; CHECK-NEXT: sel z3.d, p5, z4.d, z24.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p2/m, #0 // 
=0x0 -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.d, p4/m, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z2.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z3.d, p0/m, #0 // =0x0 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1368,7 +1181,7 @@ define @lrint_v16f64( %x) { ; CHECK-LABEL: lrint_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill @@ -1376,109 +1189,93 @@ define @lrint_v16f64( %x) { ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z24.d, 
#0x7fffffffffffffff -; CHECK-NEXT: mov z25.d, x8 +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: mov z24.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: movprfx z26, z0 -; CHECK-NEXT: frintx z26.d, p0/m, z0.d -; CHECK-NEXT: movprfx z27, z1 -; CHECK-NEXT: frintx z27.d, p0/m, z1.d +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: frintx z1.d, p0/m, z1.d +; CHECK-NEXT: movprfx z25, z4 +; CHECK-NEXT: frintx z25.d, p0/m, z4.d ; CHECK-NEXT: frintx z2.d, p0/m, z2.d -; CHECK-NEXT: mov z0.d, #0x8000000000000000 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: movprfx z28, z4 -; CHECK-NEXT: frintx z28.d, p0/m, z4.d ; CHECK-NEXT: frintx z5.d, p0/m, z5.d ; CHECK-NEXT: frintx z6.d, p0/m, z6.d +; CHECK-NEXT: mov z30.d, x8 +; CHECK-NEXT: mov z4.d, #0x8000000000000000 ; CHECK-NEXT: frintx z7.d, p0/m, z7.d -; CHECK-NEXT: fcmge p1.d, p0/z, z26.d, z25.d -; CHECK-NEXT: fcmge p2.d, p0/z, z27.d, z25.d -; CHECK-NEXT: movprfx z4, z26 -; CHECK-NEXT: fcvtzs z4.d, p0/m, z26.d -; CHECK-NEXT: fcmge p5.d, p0/z, z2.d, z25.d -; CHECK-NEXT: movprfx z29, z27 -; CHECK-NEXT: fcvtzs z29.d, p0/m, z27.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z26.d, z1.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z25.d -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z25.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z27.d, z1.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z25.d -; CHECK-NEXT: movprfx z30, z28 -; CHECK-NEXT: fcvtzs z30.d, p0/m, z28.d -; CHECK-NEXT: fcmge p10.d, p0/z, z7.d, z25.d -; CHECK-NEXT: not p4.b, p0/z, p1.b -; CHECK-NEXT: fcmuo p1.d, p0/z, z26.d, z26.d -; CHECK-NEXT: movprfx z26, z2 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z2.d -; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: movprfx z31, z6 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z6.d -; CHECK-NEXT: movprfx z8, z7 -; CHECK-NEXT: fcvtzs z8.d, p0/m, z7.d -; CHECK-NEXT: mov z4.d, p4/m, z0.d -; CHECK-NEXT: fcmge p4.d, p0/z, z28.d, z25.d -; CHECK-NEXT: not p5.b, 
p0/z, p5.b -; CHECK-NEXT: mov z29.d, p2/m, z0.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z27.d, z27.d -; CHECK-NEXT: movprfx z27, z3 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z3.d -; CHECK-NEXT: sel z25.d, p5, z0.d, z26.d -; CHECK-NEXT: movprfx z26, z5 -; CHECK-NEXT: fcvtzs z26.d, p0/m, z5.d -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: not p5.b, p0/z, p8.b -; CHECK-NEXT: fcmgt p8.d, p0/z, z2.d, z1.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z27.d, p6/m, z0.d -; CHECK-NEXT: not p6.b, p0/z, p9.b +; CHECK-NEXT: mov z28.d, #0x8000000000000000 +; CHECK-NEXT: mov z29.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z24.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z24.d +; CHECK-NEXT: fcmge p5.d, p0/z, z25.d, z24.d +; CHECK-NEXT: fcmge p3.d, p0/z, z2.d, z24.d +; CHECK-NEXT: fcmge p4.d, p0/z, z3.d, z24.d +; CHECK-NEXT: fcmge p7.d, p0/z, z5.d, z24.d +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p6.d, p0/z, z6.d, z24.d +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: mov z9.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p8.d, p0/z, z25.d, z30.d +; CHECK-NEXT: fcmgt p10.d, p0/z, z6.d, z30.d +; CHECK-NEXT: fcvtzs z26.d, p1/m, z0.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z30.d +; CHECK-NEXT: fcvtzs z4.d, p2/m, z1.d +; CHECK-NEXT: fcmge p2.d, p0/z, z7.d, z24.d +; CHECK-NEXT: mov z24.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z27.d, p3/m, z2.d +; CHECK-NEXT: fcvtzs z28.d, p4/m, z3.d +; CHECK-NEXT: fcvtzs z29.d, p5/m, z25.d +; CHECK-NEXT: fcvtzs z31.d, p7/m, z5.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z30.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z2.d, z30.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z30.d +; CHECK-NEXT: fcvtzs z8.d, p6/m, z6.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z9.d, z26.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z5.d, z30.d +; CHECK-NEXT: fcvtzs z24.d, p2/m, z7.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z7.d, z30.d +; CHECK-NEXT: fcmuo p6.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p9.d, p0/z, z2.d, z2.d -; 
CHECK-NEXT: mov z30.d, p4/m, z0.d -; CHECK-NEXT: not p4.b, p0/z, p10.b -; CHECK-NEXT: fcmgt p10.d, p0/z, z3.d, z1.d -; CHECK-NEXT: mov z26.d, p5/m, z0.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z28.d, z1.d -; CHECK-NEXT: mov z31.d, p6/m, z0.d -; CHECK-NEXT: mov z8.d, p4/m, z0.d -; CHECK-NEXT: sel z0.d, p3, z24.d, z4.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z5.d, z1.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z6.d, z1.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z7.d, z1.d -; CHECK-NEXT: sel z1.d, p7, z24.d, z29.d -; CHECK-NEXT: fcmuo p7.d, p0/z, z3.d, z3.d -; CHECK-NEXT: sel z2.d, p8, z24.d, z25.d -; CHECK-NEXT: sel z3.d, p10, z24.d, z27.d -; CHECK-NEXT: sel z4.d, p5, z24.d, z30.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z28.d, z28.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z5.d, z5.d -; CHECK-NEXT: fcmuo p10.d, p0/z, z6.d, z6.d -; CHECK-NEXT: sel z5.d, p3, z24.d, z26.d +; CHECK-NEXT: sel z1.d, p4, z9.d, z4.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d +; CHECK-NEXT: sel z2.d, p5, z9.d, z27.d +; CHECK-NEXT: sel z3.d, p7, z9.d, z28.d +; CHECK-NEXT: sel z4.d, p8, z9.d, z29.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z25.d, z25.d +; CHECK-NEXT: fcmuo p7.d, p0/z, z5.d, z5.d +; CHECK-NEXT: fcmuo p8.d, p0/z, z6.d, z6.d +; CHECK-NEXT: sel z5.d, p1, z9.d, z31.d +; CHECK-NEXT: sel z6.d, p10, z9.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: fcmuo p0.d, p0/z, z7.d, z7.d -; CHECK-NEXT: sel z6.d, p4, z24.d, z31.d -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z7.d, p6, z24.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sel z7.d, p2, z9.d, z24.d +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, p6/m, #0 // =0x0 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z2.d, p9/m, #0 // =0x0 ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z3.d, p7/m, #0 // =0x0 
+; CHECK-NEXT: mov z3.d, p4/m, #0 // =0x0 ; CHECK-NEXT: mov z4.d, p5/m, #0 // =0x0 -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z5.d, p8/m, #0 // =0x0 -; CHECK-NEXT: mov z6.d, p10/m, #0 // =0x0 -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z1.d, p2/m, #0 // =0x0 +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z5.d, p7/m, #0 // =0x0 +; CHECK-NEXT: mov z6.d, p8/m, #0 // =0x0 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p0/m, #0 // =0x0 -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %a = call @llvm.lrint.nxv16iXLen.nxv16f64( %x) @@ -1491,6 +1288,8 @@ define @lrint_v32f64( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p11, [sp] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill @@ -1513,8 +1312,8 @@ define @lrint_v32f64( %x) { ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG @@ -1527,219 +1326,176 @@ define @lrint_v32f64( %x) { ; CHECK-NEXT: ldr z0, [x0] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z2, [x0, #2, mul vl] -; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: ldr z24, [x0, #6, mul vl] ; CHECK-NEXT: ldr z1, [x0, #1, mul vl] -; CHECK-NEXT: mov z7.d, x9 -; CHECK-NEXT: mov z26.d, #0x8000000000000000 -; CHECK-NEXT: ldr z3, [x0, #3, mul vl] +; CHECK-NEXT: ldr z6, [x0, #4, mul vl] +; CHECK-NEXT: mov x9, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ldr z5, [x0, #3, mul vl] +; CHECK-NEXT: mov z25.d, x9 +; CHECK-NEXT: mov z28.d, #0x8000000000000000 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d -; CHECK-NEXT: movprfx z30, z2 -; CHECK-NEXT: frintx z30.d, p0/m, z2.d -; CHECK-NEXT: ldr 
z6, [x0, #5, mul vl] -; CHECK-NEXT: movprfx z25, z24 -; CHECK-NEXT: frintx z25.d, p0/m, z24.d -; CHECK-NEXT: movprfx z12, z1 -; CHECK-NEXT: frintx z12.d, p0/m, z1.d -; CHECK-NEXT: ldr z5, [x0, #4, mul vl] -; CHECK-NEXT: frintx z3.d, p0/m, z3.d -; CHECK-NEXT: mov x9, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: movprfx z4, z2 +; CHECK-NEXT: frintx z4.d, p0/m, z2.d +; CHECK-NEXT: mov z27.d, #0x8000000000000000 +; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: frintx z6.d, p0/m, z6.d -; CHECK-NEXT: mov z4.d, x9 -; CHECK-NEXT: fcmge p3.d, p0/z, z0.d, z7.d -; CHECK-NEXT: movprfx z24, z0 -; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d -; CHECK-NEXT: fcmge p5.d, p0/z, z30.d, z7.d -; CHECK-NEXT: movprfx z28, z30 -; CHECK-NEXT: fcvtzs z28.d, p0/m, z30.d -; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov z30.d, #0x8000000000000000 ; CHECK-NEXT: frintx z5.d, p0/m, z5.d -; CHECK-NEXT: fcmge p4.d, p0/z, z12.d, z7.d -; CHECK-NEXT: ldr z8, [x0, #7, mul vl] -; CHECK-NEXT: ldr z9, [x0, #15, mul vl] -; CHECK-NEXT: movprfx z27, z12 -; CHECK-NEXT: fcvtzs z27.d, p0/m, z12.d -; CHECK-NEXT: fcmge p6.d, p0/z, z3.d, z7.d -; CHECK-NEXT: fcmge p9.d, p0/z, z6.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p3.b -; CHECK-NEXT: movprfx z31, z3 -; CHECK-NEXT: fcvtzs z31.d, p0/m, z3.d -; CHECK-NEXT: movprfx z15, z6 -; CHECK-NEXT: fcvtzs z15.d, p0/m, z6.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: fcmge p8.d, p0/z, z5.d, z7.d -; CHECK-NEXT: movprfx z13, z5 -; CHECK-NEXT: fcvtzs z13.d, p0/m, z5.d -; CHECK-NEXT: sel z0.d, p7, z26.d, z24.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: movprfx z17, z25 -; CHECK-NEXT: fcvtzs z17.d, p0/m, z25.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: fcmge p6.d, p0/z, z25.d, z7.d -; CHECK-NEXT: movprfx z22, z9 -; CHECK-NEXT: frintx z22.d, p0/m, z9.d -; CHECK-NEXT: sel z29.d, p4, z26.d, z27.d -; CHECK-NEXT: movprfx z27, z8 -; CHECK-NEXT: frintx z27.d, p0/m, z8.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z12.d, z4.d -; 
CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: sel z0.d, p5, z26.d, z28.d -; CHECK-NEXT: not p4.b, p0/z, p8.b -; CHECK-NEXT: ldr z10, [x0, #8, mul vl] -; CHECK-NEXT: not p5.b, p0/z, p9.b -; CHECK-NEXT: sel z24.d, p3, z26.d, z31.d -; CHECK-NEXT: not p3.b, p0/z, p6.b -; CHECK-NEXT: movprfx z2, z22 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z22.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z30.d, z4.d +; CHECK-NEXT: mov z26.d, #0x8000000000000000 +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: mov z13.d, #0x8000000000000000 +; CHECK-NEXT: mov z12.d, #0x8000000000000000 +; CHECK-NEXT: mov x10, #4890909195324358655 // =0x43dfffffffffffff ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcmge p7.d, p0/z, z27.d, z7.d -; CHECK-NEXT: sel z31.d, p5, z26.d, z15.d -; CHECK-NEXT: ldr z11, [x0, #9, mul vl] -; CHECK-NEXT: movprfx z28, z10 -; CHECK-NEXT: frintx z28.d, p0/m, z10.d -; CHECK-NEXT: ldr z10, [x0, #10, mul vl] -; CHECK-NEXT: ldr z18, [x0, #11, mul vl] -; CHECK-NEXT: ldr z16, [x0, #13, mul vl] -; CHECK-NEXT: ldr z14, [x0, #14, mul vl] -; CHECK-NEXT: ldr z19, [x0, #12, mul vl] -; CHECK-NEXT: mov z17.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p9.d, p0/z, z3.d, z4.d -; CHECK-NEXT: movprfx z8, z11 -; CHECK-NEXT: frintx z8.d, p0/m, z11.d -; CHECK-NEXT: sel z11.d, p4, z26.d, z13.d -; CHECK-NEXT: frintx z10.d, p0/m, z10.d -; CHECK-NEXT: movprfx z13, z18 -; CHECK-NEXT: frintx z13.d, p0/m, z18.d -; CHECK-NEXT: fcmge p5.d, p0/z, z28.d, z7.d -; CHECK-NEXT: movprfx z18, z27 -; CHECK-NEXT: fcvtzs z18.d, p0/m, z27.d +; CHECK-NEXT: fcmge p3.d, p0/z, z4.d, z25.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z25.d +; CHECK-NEXT: ldr z29, [x0, #7, mul vl] +; CHECK-NEXT: ldr z24, [x0, #6, mul vl] +; CHECK-NEXT: ldr z10, [x0, #9, mul vl] +; CHECK-NEXT: ldr z8, [x0, #8, mul vl] +; CHECK-NEXT: ldr z7, [x0, #5, mul vl] +; CHECK-NEXT: ldr z14, [x0, #15, mul vl] +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z25.d +; CHECK-NEXT: fcmge p5.d, p0/z, z6.d, z25.d +; CHECK-NEXT: ldr 
z15, [x0, #14, mul vl] +; CHECK-NEXT: frintx z29.d, p0/m, z29.d +; CHECK-NEXT: frintx z24.d, p0/m, z24.d +; CHECK-NEXT: movprfx z11, z10 +; CHECK-NEXT: frintx z11.d, p0/m, z10.d +; CHECK-NEXT: fcmge p4.d, p0/z, z5.d, z25.d +; CHECK-NEXT: movprfx z9, z8 +; CHECK-NEXT: frintx z9.d, p0/m, z8.d +; CHECK-NEXT: ldr z16, [x0, #11, mul vl] +; CHECK-NEXT: ldr z20, [x0, #13, mul vl] +; CHECK-NEXT: frintx z7.d, p0/m, z7.d +; CHECK-NEXT: fcvtzs z28.d, p3/m, z4.d +; CHECK-NEXT: mov z10.d, #0x8000000000000000 +; CHECK-NEXT: ldr z18, [x0, #12, mul vl] +; CHECK-NEXT: movprfx z19, z14 +; CHECK-NEXT: frintx z19.d, p0/m, z14.d +; CHECK-NEXT: fcmge p3.d, p0/z, z29.d, z25.d +; CHECK-NEXT: ldr z17, [x0, #10, mul vl] +; CHECK-NEXT: frintx z15.d, p0/m, z15.d +; CHECK-NEXT: fcvtzs z27.d, p2/m, z1.d +; CHECK-NEXT: fcvtzs z30.d, p5/m, z6.d +; CHECK-NEXT: fcmge p2.d, p0/z, z24.d, z25.d +; CHECK-NEXT: fcmge p5.d, p0/z, z11.d, z25.d +; CHECK-NEXT: mov z14.d, #0x8000000000000000 ; CHECK-NEXT: frintx z16.d, p0/m, z16.d -; CHECK-NEXT: movprfx z15, z19 -; CHECK-NEXT: frintx z15.d, p0/m, z19.d -; CHECK-NEXT: movprfx z19, z28 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z28.d -; CHECK-NEXT: movprfx z21, z14 -; CHECK-NEXT: frintx z21.d, p0/m, z14.d -; CHECK-NEXT: not p4.b, p0/z, p7.b -; CHECK-NEXT: fcmge p6.d, p0/z, z8.d, z7.d -; CHECK-NEXT: movprfx z20, z8 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z8.d -; CHECK-NEXT: fcmge p7.d, p0/z, z10.d, z7.d -; CHECK-NEXT: fcmge p8.d, p0/z, z13.d, z7.d -; CHECK-NEXT: not p5.b, p0/z, p5.b -; CHECK-NEXT: sel z9.d, p4, z26.d, z18.d -; CHECK-NEXT: fcmge p4.d, p0/z, z16.d, z7.d -; CHECK-NEXT: fcmge p3.d, p0/z, z15.d, z7.d -; CHECK-NEXT: movprfx z0, z16 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z16.d -; CHECK-NEXT: sel z14.d, p5, z26.d, z19.d -; CHECK-NEXT: movprfx z19, z10 -; CHECK-NEXT: fcvtzs z19.d, p0/m, z10.d -; CHECK-NEXT: movprfx z1, z21 -; CHECK-NEXT: fcvtzs z1.d, p0/m, z21.d -; CHECK-NEXT: not p6.b, p0/z, p6.b -; CHECK-NEXT: movprfx z23, z15 -; CHECK-NEXT: fcvtzs z23.d, p0/m, z15.d 
-; CHECK-NEXT: not p5.b, p0/z, p7.b -; CHECK-NEXT: sel z18.d, p6, z26.d, z20.d -; CHECK-NEXT: fcmge p6.d, p0/z, z21.d, z7.d -; CHECK-NEXT: not p7.b, p0/z, p8.b -; CHECK-NEXT: fcmge p8.d, p0/z, z22.d, z7.d -; CHECK-NEXT: movprfx z20, z13 -; CHECK-NEXT: fcvtzs z20.d, p0/m, z13.d -; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z7.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z19.d, p5/m, z26.d -; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: mov z0.d, p4/m, z26.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z21.d, z4.d -; CHECK-NEXT: not p5.b, p0/z, p6.b -; CHECK-NEXT: mov z23.d, p3/m, z26.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z22.d, z4.d -; CHECK-NEXT: not p6.b, p0/z, p8.b -; CHECK-NEXT: mov z20.d, p7/m, z26.d -; CHECK-NEXT: fcmuo p8.d, p0/z, z22.d, z22.d -; CHECK-NEXT: mov z1.d, p5/m, z26.d -; CHECK-NEXT: fcmuo p5.d, p0/z, z21.d, z21.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z25.d, z4.d -; CHECK-NEXT: mov z2.d, p6/m, z26.d -; CHECK-NEXT: sel z26.d, p1, z7.d, z29.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z16.d, z4.d -; CHECK-NEXT: ldr z29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcmgt p6.d, p0/z, z5.d, z4.d -; CHECK-NEXT: mov z24.d, p9/m, z7.d -; CHECK-NEXT: mov z1.d, p4/m, z7.d -; CHECK-NEXT: fcmuo p4.d, p0/z, z16.d, z16.d -; CHECK-NEXT: mov z2.d, p3/m, z7.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z4.d -; CHECK-NEXT: mov z17.d, p7/m, z7.d -; CHECK-NEXT: mov z29.d, p2/m, z7.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z13.d, z4.d -; CHECK-NEXT: mov z0.d, p1/m, z7.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z10.d, z4.d -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: mov z11.d, p6/m, z7.d +; CHECK-NEXT: frintx z20.d, p0/m, z20.d +; CHECK-NEXT: fcvtzs z26.d, p4/m, z5.d +; CHECK-NEXT: fcmge p4.d, p0/z, z9.d, z25.d +; CHECK-NEXT: frintx z18.d, p0/m, z18.d +; CHECK-NEXT: mov z31.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z2.d, p1/m, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z7.d, z25.d +; CHECK-NEXT: mov z8.d, #0x8000000000000000 +; CHECK-NEXT: frintx z17.d, p0/m, z17.d +; CHECK-NEXT: fcvtzs z10.d, 
p3/m, z29.d +; CHECK-NEXT: fcmge p3.d, p0/z, z19.d, z25.d +; CHECK-NEXT: mov z3.d, x10 +; CHECK-NEXT: fcmge p6.d, p0/z, z15.d, z25.d +; CHECK-NEXT: mov z21.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z13.d, p2/m, z24.d +; CHECK-NEXT: fcvtzs z14.d, p5/m, z11.d +; CHECK-NEXT: fcmge p2.d, p0/z, z16.d, z25.d +; CHECK-NEXT: mov z22.d, #0x8000000000000000 +; CHECK-NEXT: fcmge p5.d, p0/z, z20.d, z25.d +; CHECK-NEXT: mov z0.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z12.d, p4/m, z9.d +; CHECK-NEXT: fcmge p4.d, p0/z, z18.d, z25.d +; CHECK-NEXT: mov z23.d, #0x8000000000000000 +; CHECK-NEXT: fcvtzs z8.d, p1/m, z7.d +; CHECK-NEXT: fcmge p1.d, p0/z, z17.d, z25.d +; CHECK-NEXT: fcvtzs z31.d, p3/m, z19.d +; CHECK-NEXT: mov z25.d, #0x7fffffffffffffff +; CHECK-NEXT: fcmgt p11.d, p0/z, z19.d, z3.d +; CHECK-NEXT: fcvtzs z21.d, p6/m, z15.d +; CHECK-NEXT: fcmgt p3.d, p0/z, z15.d, z3.d ; CHECK-NEXT: fcmuo p6.d, p0/z, z15.d, z15.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z8.d, z4.d -; CHECK-NEXT: mov z2.d, p8/m, #0 // =0x0 -; CHECK-NEXT: sel z16.d, p3, z7.d, z23.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z10.d, z10.d -; CHECK-NEXT: mov z0.d, p4/m, #0 // =0x0 -; CHECK-NEXT: sel z15.d, p2, z7.d, z20.d -; CHECK-NEXT: fcmuo p2.d, p0/z, z13.d, z13.d -; CHECK-NEXT: str z1, [x8, #14, mul vl] -; CHECK-NEXT: sel z1.d, p1, z7.d, z19.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z28.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z27.d, z4.d -; CHECK-NEXT: str z2, [x8, #15, mul vl] -; CHECK-NEXT: sel z2.d, p5, z7.d, z18.d -; CHECK-NEXT: mov z16.d, p6/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z8.d, z8.d +; CHECK-NEXT: mov z15.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z3.d +; CHECK-NEXT: fcvtzs z22.d, p2/m, z16.d +; CHECK-NEXT: fcvtzs z0.d, p5/m, z20.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z24.d, z3.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z20.d, z3.d +; CHECK-NEXT: fcvtzs z23.d, p4/m, z18.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z19.d, z19.d +; CHECK-NEXT: mov z31.d, p11/m, z25.d +; CHECK-NEXT: sel z19.d, p3, z25.d, z21.d 
+; CHECK-NEXT: fcmgt p3.d, p0/z, z18.d, z3.d +; CHECK-NEXT: fcvtzs z15.d, p1/m, z17.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z20.d, z20.d +; CHECK-NEXT: mov z27.d, p7/m, z25.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z16.d, z3.d +; CHECK-NEXT: mov z13.d, p2/m, z25.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z17.d, z3.d +; CHECK-NEXT: mov z0.d, p5/m, z25.d +; CHECK-NEXT: mov z31.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z18.d, z18.d +; CHECK-NEXT: sel z20.d, p3, z25.d, z23.d +; CHECK-NEXT: fcmuo p3.d, p0/z, z16.d, z16.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z11.d, z3.d +; CHECK-NEXT: mov z19.d, p6/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p10.d, p0/z, z6.d, z3.d +; CHECK-NEXT: fcmgt p8.d, p0/z, z4.d, z3.d +; CHECK-NEXT: str z31, [x8, #15, mul vl] +; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p1.d, p0/z, z17.d, z17.d +; CHECK-NEXT: sel z18.d, p7, z25.d, z22.d +; CHECK-NEXT: sel z31.d, p2, z25.d, z15.d +; CHECK-NEXT: fcmgt p2.d, p0/z, z9.d, z3.d +; CHECK-NEXT: str z19, [x8, #14, mul vl] +; CHECK-NEXT: mov z20.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z11.d, z11.d ; CHECK-NEXT: str z0, [x8, #13, mul vl] -; CHECK-NEXT: mov z15.d, p2/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p2.d, p0/z, z28.d, z28.d -; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 -; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z4.d -; CHECK-NEXT: sel z0.d, p1, z7.d, z14.d -; CHECK-NEXT: fcmuo p1.d, p0/z, z27.d, z27.d -; CHECK-NEXT: sel z27.d, p4, z7.d, z9.d -; CHECK-NEXT: str z16, [x8, #12, mul vl] -; CHECK-NEXT: fcmuo p4.d, p0/z, z25.d, z25.d -; CHECK-NEXT: str z15, [x8, #11, mul vl] -; CHECK-NEXT: mov z2.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z6.d, z6.d -; CHECK-NEXT: str z1, [x8, #10, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p3, z7.d, z31.d -; CHECK-NEXT: fcmuo p3.d, p0/z, z5.d, z5.d -; CHECK-NEXT: ldr z5, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov z27.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z2, [x8, #9, mul vl] -; CHECK-NEXT: fcmuo p1.d, p0/z, z3.d, z3.d 
+; CHECK-NEXT: mov z14.d, p5/m, z25.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z29.d, z3.d +; CHECK-NEXT: mov z18.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z31.d, p1/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p1.d, p0/z, z7.d, z3.d +; CHECK-NEXT: str z20, [x8, #12, mul vl] +; CHECK-NEXT: fcmuo p3.d, p0/z, z9.d, z9.d +; CHECK-NEXT: sel z0.d, p2, z25.d, z12.d +; CHECK-NEXT: mov z14.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p4.d, p0/z, z7.d, z7.d +; CHECK-NEXT: fcmuo p2.d, p0/z, z29.d, z29.d +; CHECK-NEXT: str z18, [x8, #11, mul vl] +; CHECK-NEXT: sel z29.d, p5, z25.d, z10.d +; CHECK-NEXT: fcmuo p5.d, p0/z, z24.d, z24.d +; CHECK-NEXT: str z31, [x8, #10, mul vl] +; CHECK-NEXT: sel z7.d, p1, z25.d, z8.d +; CHECK-NEXT: fcmuo p1.d, p0/z, z6.d, z6.d +; CHECK-NEXT: ldr z6, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str z14, [x8, #9, mul vl] +; CHECK-NEXT: fcmgt p9.d, p0/z, z5.d, z3.d +; CHECK-NEXT: mov z0.d, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z29.d, p2/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p2.d, p0/z, z5.d, z5.d +; CHECK-NEXT: mov z13.d, p5/m, #0 // =0x0 +; CHECK-NEXT: fcmuo p5.d, p0/z, z4.d, z4.d +; CHECK-NEXT: mov z7.d, p4/m, #0 // =0x0 +; CHECK-NEXT: fcmgt p3.d, p0/z, z6.d, z3.d +; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d +; CHECK-NEXT: fcmuo p0.d, p0/z, z6.d, z6.d ; CHECK-NEXT: str z0, [x8, #8, mul vl] -; CHECK-NEXT: mov z17.d, p4/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p4.d, p0/z, z30.d, z30.d -; CHECK-NEXT: fcmgt p2.d, p0/z, z5.d, z4.d -; CHECK-NEXT: mov z1.d, p5/m, #0 // =0x0 -; CHECK-NEXT: fcmuo p5.d, p0/z, z12.d, z12.d -; CHECK-NEXT: str z27, [x8, #7, mul vl] -; CHECK-NEXT: fcmuo p0.d, p0/z, z5.d, z5.d -; CHECK-NEXT: mov z11.d, p3/m, #0 // =0x0 -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: mov z24.d, p1/m, #0 // =0x0 -; CHECK-NEXT: str z17, [x8, #6, mul vl] -; CHECK-NEXT: mov z29.d, p4/m, #0 // =0x0 -; CHECK-NEXT: str z1, [x8, #5, mul vl] -; CHECK-NEXT: mov z26.d, p5/m, #0 // =0x0 -; CHECK-NEXT: str z11, [x8, #4, mul vl] -; CHECK-NEXT: mov z0.d, p2/m, z7.d 
-; CHECK-NEXT: str z24, [x8, #3, mul vl] -; CHECK-NEXT: str z29, [x8, #2, mul vl] -; CHECK-NEXT: str z26, [x8, #1, mul vl] +; CHECK-NEXT: mov z28.d, p8/m, z25.d +; CHECK-NEXT: mov z26.d, p9/m, z25.d +; CHECK-NEXT: str z29, [x8, #7, mul vl] +; CHECK-NEXT: mov z30.d, p10/m, z25.d +; CHECK-NEXT: str z13, [x8, #6, mul vl] +; CHECK-NEXT: str z7, [x8, #5, mul vl] +; CHECK-NEXT: sel z0.d, p3, z25.d, z2.d +; CHECK-NEXT: mov z26.d, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z30.d, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z28.d, p5/m, #0 // =0x0 +; CHECK-NEXT: mov z27.d, p4/m, #0 // =0x0 +; CHECK-NEXT: str z26, [x8, #3, mul vl] ; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0 +; CHECK-NEXT: str z30, [x8, #4, mul vl] +; CHECK-NEXT: str z28, [x8, #2, mul vl] +; CHECK-NEXT: str z27, [x8, #1, mul vl] ; CHECK-NEXT: str z0, [x8] -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1756,6 +1512,8 @@ define @lrint_v32f64( %x) { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index bbc94f568dd0a..0c0762da5bba2 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -989,9 +989,9 @@ define @fadd_nxv4f32_x( %x, %n, zeroinitializer @@ -1004,9 +1004,9 @@ define @fadd_nxv8f16_x( %x, %n, 
zeroinitializer @@ -1019,9 +1019,9 @@ define @fadd_nxv2f64_x( %x, %n, zeroinitializer @@ -1034,9 +1034,9 @@ define @fsub_nxv4f32_x( %x, %n, zeroinitializer @@ -1049,9 +1049,9 @@ define @fsub_nxv8f16_x( %x, %n, zeroinitializer @@ -1064,9 +1064,9 @@ define @fsub_nxv2f64_x( %x, %n, zeroinitializer @@ -1079,9 +1079,9 @@ define @fmul_nxv4f32_x( %x, %n, zeroinitializer @@ -1094,9 +1094,9 @@ define @fmul_nxv8f16_x( %x, %n, zeroinitializer @@ -1109,9 +1109,9 @@ define @fmul_nxv2f64_x( %x, %n, zeroinitializer @@ -1125,9 +1125,8 @@ define @fdiv_nxv4f32_x( %x, %n, zeroinitializer @@ -1141,9 +1140,8 @@ define @fdiv_nxv8f16_x( %x, %n, zeroinitializer @@ -1157,9 +1155,8 @@ define @fdiv_nxv2f64_x( %x, %n, zeroinitializer @@ -1173,8 +1170,8 @@ define @minnum_nxv4f32_x( %x, %n, zeroinitializer @@ -1188,8 +1185,8 @@ define @minnum_nxv8f16_x( %x, %n, zeroinitializer @@ -1203,8 +1200,8 @@ define @minnum_nxv2f64_x( %x, %n, zeroinitializer @@ -1218,8 +1215,8 @@ define @maxnum_nxv4f32_x( %x, %n, zeroinitializer @@ -1233,8 +1230,8 @@ define @maxnum_nxv8f16_x( %x, %n, zeroinitializer @@ -1248,8 +1245,8 @@ define @maxnum_nxv2f64_x( %x, %n, zeroinitializer @@ -1263,8 +1260,8 @@ define @minimum_nxv4f32_x( %x, %n, zeroinitializer @@ -1278,8 +1275,8 @@ define @minimum_nxv8f16_x( %x, %n, zeroinitializer @@ -1293,8 +1290,8 @@ define @minimum_nxv2f64_x( %x, %n, zeroinitializer @@ -1308,8 +1305,8 @@ define @maximum_nxv4f32_x( %x, %n, zeroinitializer @@ -1323,8 +1320,8 @@ define @maximum_nxv8f16_x( %x, %n, zeroinitializer @@ -1338,8 +1335,8 @@ define @maximum_nxv2f64_x( %x, %n, zeroinitializer @@ -1353,8 +1350,8 @@ define @fmai_nxv4f32_x( %x, %n, zeroinitializer @@ -1368,8 +1365,8 @@ define @fmai_nxv8f16_x( %x, %n, zeroinitializer @@ -1383,8 +1380,8 @@ define @fmai_nxv2f64_x( %x, %n, zeroinitializer @@ -1398,8 +1395,8 @@ define @fma_nxv4f32_x( %x, %n, zeroinitializer @@ -1414,8 +1411,8 @@ define @fma_nxv8f16_x( %x, %n, zeroinitializer @@ -1430,8 +1427,8 @@ define @fma_nxv2f64_x( %x, %n, 
zeroinitializer @@ -2470,9 +2467,8 @@ define @fadd_nxv4f32_y( %x, %n, zeroinitializer @@ -2486,9 +2482,8 @@ define @fadd_nxv8f16_y( %x, %n, zeroinitializer @@ -2502,9 +2497,8 @@ define @fadd_nxv2f64_y( %x, %n, zeroinitializer @@ -2517,10 +2511,9 @@ define @fsub_nxv4f32_y( %x, %n, zeroinitializer @@ -2533,10 +2526,9 @@ define @fsub_nxv8f16_y( %x, %n, zeroinitializer @@ -2549,10 +2541,9 @@ define @fsub_nxv2f64_y( %x, %n, zeroinitializer @@ -2566,9 +2557,8 @@ define @fmul_nxv4f32_y( %x, %n, zeroinitializer @@ -2582,9 +2572,8 @@ define @fmul_nxv8f16_y( %x, %n, zeroinitializer @@ -2598,9 +2587,8 @@ define @fmul_nxv2f64_y( %x, %n, zeroinitializer @@ -2614,9 +2602,8 @@ define @fdiv_nxv4f32_y( %x, %n, zeroinitializer @@ -2630,9 +2617,8 @@ define @fdiv_nxv8f16_y( %x, %n, zeroinitializer @@ -2646,9 +2632,8 @@ define @fdiv_nxv2f64_y( %x, %n, zeroinitializer @@ -2662,9 +2647,8 @@ define @minnum_nxv4f32_y( %x, %n, zeroinitializer @@ -2678,9 +2662,8 @@ define @minnum_nxv8f16_y( %x, %n, zeroinitializer @@ -2694,9 +2677,8 @@ define @minnum_nxv2f64_y( %x, %n, zeroinitializer @@ -2710,9 +2692,8 @@ define @maxnum_nxv4f32_y( %x, %n, zeroinitializer @@ -2726,9 +2707,8 @@ define @maxnum_nxv8f16_y( %x, %n, zeroinitializer @@ -2742,9 +2722,8 @@ define @maxnum_nxv2f64_y( %x, %n, zeroinitializer @@ -2758,9 +2737,8 @@ define @minimum_nxv4f32_y( %x, %n, zeroinitializer @@ -2774,9 +2752,8 @@ define @minimum_nxv8f16_y( %x, %n, zeroinitializer @@ -2790,9 +2767,8 @@ define @minimum_nxv2f64_y( %x, %n, zeroinitializer @@ -2806,9 +2782,8 @@ define @maximum_nxv4f32_y( %x, %n, zeroinitializer @@ -2822,9 +2797,8 @@ define @maximum_nxv8f16_y( %x, %n, zeroinitializer @@ -2838,9 +2812,8 @@ define @maximum_nxv2f64_y( %x, %n, zeroinitializer @@ -2855,8 +2828,7 @@ define @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -2871,8 +2843,7 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -2887,8 +2858,7 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -2903,8 +2873,7 @@ define @fma_nxv4f32_y( %x, %n, 
zeroinitializer @@ -2920,8 +2889,7 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -2937,8 +2905,7 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 66dece82a0ac5..58d6149b94d3a 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -641,9 +641,9 @@ define @fadd_nxv4f32_x( %x, %n, zeroinitializer @@ -656,9 +656,9 @@ define @fadd_nxv8f16_x( %x, %n, zeroinitializer @@ -671,9 +671,9 @@ define @fadd_nxv2f64_x( %x, %n, zeroinitializer @@ -686,9 +686,9 @@ define @fsub_nxv4f32_x( %x, %n, zeroinitializer @@ -701,9 +701,9 @@ define @fsub_nxv8f16_x( %x, %n, zeroinitializer @@ -716,9 +716,9 @@ define @fsub_nxv2f64_x( %x, %n, zeroinitializer @@ -731,9 +731,9 @@ define @fmul_nxv4f32_x( %x, %n, zeroinitializer @@ -746,9 +746,9 @@ define @fmul_nxv8f16_x( %x, %n, zeroinitializer @@ -761,9 +761,9 @@ define @fmul_nxv2f64_x( %x, %n, zeroinitializer @@ -777,9 +777,8 @@ define @fdiv_nxv4f32_x( %x, %n, zeroinitializer @@ -793,9 +792,8 @@ define @fdiv_nxv8f16_x( %x, %n, zeroinitializer @@ -809,9 +807,8 @@ define @fdiv_nxv2f64_x( %x, %n, zeroinitializer @@ -825,8 +822,8 @@ define @fma_nxv4f32_x( %x, %n, zeroinitializer @@ -841,8 +838,8 @@ define @fma_nxv8f16_x( %x, %n, zeroinitializer @@ -857,8 +854,8 @@ define @fma_nxv2f64_x( %x, %n, zeroinitializer @@ -1540,10 +1537,9 @@ define @fadd_nxv4f32_y( %x, %n, zeroinitializer @@ -1556,10 +1552,9 @@ define @fadd_nxv8f16_y( %x, %n, zeroinitializer @@ -1572,10 +1567,9 @@ define @fadd_nxv2f64_y( %x, %n, zeroinitializer @@ -1588,10 +1582,9 @@ define @fsub_nxv4f32_y( %x, %n, zeroinitializer @@ -1604,10 +1597,9 @@ define @fsub_nxv8f16_y( %x, %n, zeroinitializer @@ -1620,10 +1612,9 @@ define @fsub_nxv2f64_y( %x, %n, zeroinitializer @@ -1636,10 +1627,9 @@ define @fmul_nxv4f32_y( %x, %n, zeroinitializer @@ -1652,10 +1642,9 @@ define @fmul_nxv8f16_y( %x, %n, zeroinitializer 
@@ -1668,10 +1657,9 @@ define @fmul_nxv2f64_y( %x, %n, zeroinitializer @@ -1685,9 +1673,8 @@ define @fdiv_nxv4f32_y( %x, %n, zeroinitializer @@ -1701,9 +1688,8 @@ define @fdiv_nxv8f16_y( %x, %n, zeroinitializer @@ -1717,9 +1703,8 @@ define @fdiv_nxv2f64_y( %x, %n, zeroinitializer @@ -1734,8 +1719,7 @@ define @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -1750,8 +1734,7 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -1766,8 +1749,7 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -1782,8 +1764,7 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -1799,8 +1780,7 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -1816,8 +1796,7 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 2e993a85760c6..8a84d3ca2328c 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -27,9 +27,6 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) { ; CHECK-SD-FP16-LABEL: add_v2HalfH: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-FP16-NEXT: mov v0.h[2], wzr -; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr -; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h ; CHECK-SD-FP16-NEXT: ret ; diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll index ff89cc21b56da..43f7cd96a3b48 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -1,14 +1,67 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti 
-fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s - -; GCN-LABEL: {{^}}fmuladd_f64: -; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @fmuladd_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-CONTRACT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-CONTRACT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global 
-fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-CONTRACT %s + +define amdgpu_kernel void @fmuladd_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-LABEL: fmuladd_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmuladd_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f64 
v[0:1], v[0:1], v[2:3], v[4:5] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1 %r1 = load double, ptr addrspace(1) %in2 %r2 = load double, ptr addrspace(1) %in3 @@ -17,13 +70,122 @@ define amdgpu_kernel void @fmuladd_f64(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}fmul_fadd_f64: -; GCN-CONTRACT: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @fmul_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmul_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-STRICT-LABEL: fmul_fadd_f64: +; SI-STRICT: ; %bb.0: +; SI-STRICT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-STRICT-NEXT: s_mov_b32 s11, 0xf000 +; SI-STRICT-NEXT: s_mov_b32 s10, -1 +; SI-STRICT-NEXT: s_mov_b32 s14, s10 +; SI-STRICT-NEXT: s_mov_b32 s15, s11 +; SI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-STRICT-NEXT: s_mov_b32 s12, s2 +; SI-STRICT-NEXT: s_mov_b32 s13, s3 +; SI-STRICT-NEXT: s_mov_b32 s16, s4 +; SI-STRICT-NEXT: s_mov_b32 s17, s5 +; SI-STRICT-NEXT: s_mov_b32 s18, s10 +; SI-STRICT-NEXT: s_mov_b32 s19, s11 +; SI-STRICT-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-STRICT-NEXT: s_mov_b32 s4, s6 +; SI-STRICT-NEXT: s_mov_b32 s5, s7 +; SI-STRICT-NEXT: s_mov_b32 s6, s10 +; SI-STRICT-NEXT: s_mov_b32 s7, s11 +; SI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-STRICT-NEXT: s_mov_b32 s8, s0 +; SI-STRICT-NEXT: s_mov_b32 s9, s1 +; SI-STRICT-NEXT: s_waitcnt vmcnt(1) +; SI-STRICT-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; SI-STRICT-NEXT: 
s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; SI-STRICT-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-STRICT-NEXT: s_endpgm +; +; SI-CONTRACT-LABEL: fmul_fadd_f64: +; SI-CONTRACT: ; %bb.0: +; SI-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-CONTRACT-NEXT: s_mov_b32 s11, 0xf000 +; SI-CONTRACT-NEXT: s_mov_b32 s10, -1 +; SI-CONTRACT-NEXT: s_mov_b32 s14, s10 +; SI-CONTRACT-NEXT: s_mov_b32 s15, s11 +; SI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-CONTRACT-NEXT: s_mov_b32 s12, s2 +; SI-CONTRACT-NEXT: s_mov_b32 s13, s3 +; SI-CONTRACT-NEXT: s_mov_b32 s16, s4 +; SI-CONTRACT-NEXT: s_mov_b32 s17, s5 +; SI-CONTRACT-NEXT: s_mov_b32 s18, s10 +; SI-CONTRACT-NEXT: s_mov_b32 s19, s11 +; SI-CONTRACT-NEXT: s_mov_b32 s4, s6 +; SI-CONTRACT-NEXT: s_mov_b32 s5, s7 +; SI-CONTRACT-NEXT: s_mov_b32 s6, s10 +; SI-CONTRACT-NEXT: s_mov_b32 s7, s11 +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-CONTRACT-NEXT: s_mov_b32 s8, s0 +; SI-CONTRACT-NEXT: s_mov_b32 s9, s1 +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-CONTRACT-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-CONTRACT-NEXT: s_endpgm +; +; VI-STRICT-LABEL: fmul_fadd_f64: +; VI-STRICT: ; %bb.0: +; VI-STRICT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-STRICT-NEXT: s_mov_b32 s11, 0xf000 +; VI-STRICT-NEXT: s_mov_b32 s10, -1 +; VI-STRICT-NEXT: s_mov_b32 s14, s10 +; VI-STRICT-NEXT: s_mov_b32 s15, s11 +; VI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; VI-STRICT-NEXT: s_mov_b32 s12, s2 +; VI-STRICT-NEXT: s_mov_b32 s13, s3 +; VI-STRICT-NEXT: s_mov_b32 s16, s4 +; VI-STRICT-NEXT: s_mov_b32 s17, s5 +; VI-STRICT-NEXT: s_mov_b32 s18, s10 +; VI-STRICT-NEXT: s_mov_b32 s19, s11 +; VI-STRICT-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; VI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], off, 
s[16:19], 0 +; VI-STRICT-NEXT: s_mov_b32 s4, s6 +; VI-STRICT-NEXT: s_mov_b32 s5, s7 +; VI-STRICT-NEXT: s_mov_b32 s6, s10 +; VI-STRICT-NEXT: s_mov_b32 s7, s11 +; VI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; VI-STRICT-NEXT: s_mov_b32 s8, s0 +; VI-STRICT-NEXT: s_mov_b32 s9, s1 +; VI-STRICT-NEXT: s_waitcnt vmcnt(1) +; VI-STRICT-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; VI-STRICT-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-STRICT-NEXT: s_endpgm +; +; VI-CONTRACT-LABEL: fmul_fadd_f64: +; VI-CONTRACT: ; %bb.0: +; VI-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-CONTRACT-NEXT: s_mov_b32 s11, 0xf000 +; VI-CONTRACT-NEXT: s_mov_b32 s10, -1 +; VI-CONTRACT-NEXT: s_mov_b32 s14, s10 +; VI-CONTRACT-NEXT: s_mov_b32 s15, s11 +; VI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-CONTRACT-NEXT: s_mov_b32 s12, s2 +; VI-CONTRACT-NEXT: s_mov_b32 s13, s3 +; VI-CONTRACT-NEXT: s_mov_b32 s16, s4 +; VI-CONTRACT-NEXT: s_mov_b32 s17, s5 +; VI-CONTRACT-NEXT: s_mov_b32 s18, s10 +; VI-CONTRACT-NEXT: s_mov_b32 s19, s11 +; VI-CONTRACT-NEXT: s_mov_b32 s4, s6 +; VI-CONTRACT-NEXT: s_mov_b32 s5, s7 +; VI-CONTRACT-NEXT: s_mov_b32 s6, s10 +; VI-CONTRACT-NEXT: s_mov_b32 s7, s11 +; VI-CONTRACT-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; VI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; VI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; VI-CONTRACT-NEXT: s_mov_b32 s8, s0 +; VI-CONTRACT-NEXT: s_mov_b32 s9, s1 +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-CONTRACT-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-CONTRACT-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1 %r1 = load double, ptr addrspace(1) %in2 %r2 = load double, ptr addrspace(1) %in3 @@ -33,11 +195,62 @@ define amdgpu_kernel void @fmul_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; GCN-LABEL: 
{{^}}fmul_fadd_contract_f64: -; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define amdgpu_kernel void @fmul_fadd_contract_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmul_fadd_contract_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-LABEL: fmul_fadd_contract_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_fadd_contract_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 
+; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1 %r1 = load double, ptr addrspace(1) %in2 %r2 = load double, ptr addrspace(1) %in3 @@ -47,20 +260,76 @@ define amdgpu_kernel void @fmul_fadd_contract_f64(ptr addrspace(1) %out, ptr add ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f64: -; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]], -; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]], - -; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]] -; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]], [[R2]] - -; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]] - -; SI: buffer_store_dwordx2 [[RESULT]] -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @fadd_a_a_b_f64(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_a_a_b_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-STRICT-LABEL: fadd_a_a_b_f64: +; SI-STRICT: ; %bb.0: +; SI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; SI-STRICT-NEXT: buffer_store_dwordx2 v[2:3], 
v[0:1], s[0:3], 0 addr64 +; SI-STRICT-NEXT: s_endpgm +; +; SI-CONTRACT-LABEL: fadd_a_a_b_f64: +; SI-CONTRACT: ; %bb.0: +; SI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], 2.0, v[4:5] +; SI-CONTRACT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-CONTRACT-NEXT: s_endpgm +; +; VI-STRICT-LABEL: fadd_a_a_b_f64: +; VI-STRICT: ; %bb.0: +; VI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; VI-STRICT-NEXT: v_mov_b32_e32 v1, s1 +; VI-STRICT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; VI-STRICT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-STRICT-NEXT: s_endpgm +; +; VI-CONTRACT-LABEL: fadd_a_a_b_f64: +; VI-CONTRACT: ; %bb.0: +; VI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
VI-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[4:5], 2.0, v[2:3] +; VI-CONTRACT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -75,20 +344,76 @@ define amdgpu_kernel void @fadd_a_a_b_f64(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fadd_b_a_a_f64: -; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]], -; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]], - -; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]] -; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R2]], [[TMP]] - -; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]] - -; SI: buffer_store_dwordx2 [[RESULT]] -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @fadd_b_a_a_f64(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_b_a_a_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-STRICT-LABEL: fadd_b_a_a_f64: +; SI-STRICT: ; %bb.0: +; SI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STRICT-NEXT: s_waitcnt 
vmcnt(0) +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-STRICT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-STRICT-NEXT: s_endpgm +; +; SI-CONTRACT-LABEL: fadd_b_a_a_f64: +; SI-CONTRACT: ; %bb.0: +; SI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], 2.0, v[4:5] +; SI-CONTRACT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-CONTRACT-NEXT: s_endpgm +; +; VI-STRICT-LABEL: fadd_b_a_a_f64: +; VI-STRICT: ; %bb.0: +; VI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; VI-STRICT-NEXT: v_mov_b32_e32 v1, s1 +; VI-STRICT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; VI-STRICT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-STRICT-NEXT: s_endpgm +; +; VI-CONTRACT-LABEL: fadd_b_a_a_f64: +; VI-CONTRACT: ; %bb.0: +; VI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[4:5], 2.0, v[2:3] +; VI-CONTRACT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -103,12 +428,98 @@ define amdgpu_kernel void @fadd_b_a_a_f64(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}mad_sub_f64: -; GCN-STRICT: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} -; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} - -; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @mad_sub_f64(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; SI-STRICT-LABEL: mad_sub_f64: +; SI-STRICT: ; %bb.0: +; SI-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 
v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-STRICT-NEXT: v_mul_f64 v[2:3], v[2:3], v[4:5] +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; SI-STRICT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-STRICT-NEXT: s_endpgm +; +; SI-CONTRACT-LABEL: mad_sub_f64: +; SI-CONTRACT: ; %bb.0: +; SI-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], -v[6:7] +; SI-CONTRACT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-CONTRACT-NEXT: s_endpgm +; +; VI-STRICT-LABEL: mad_sub_f64: +; VI-STRICT: ; %bb.0: +; VI-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-STRICT-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; VI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; VI-STRICT-NEXT: v_mov_b32_e32 v1, s3 +; VI-STRICT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-STRICT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-STRICT-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-STRICT-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3] +; VI-STRICT-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] +; VI-STRICT-NEXT: v_mov_b32_e32 v3, s1 +; VI-STRICT-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; VI-STRICT-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-STRICT-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-STRICT-NEXT: s_endpgm +; +; VI-CONTRACT-LABEL: mad_sub_f64: +; VI-CONTRACT: ; %bb.0: +; VI-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; VI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] +; VI-CONTRACT-NEXT: v_mov_b32_e32 v3, s1 +; VI-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-CONTRACT-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr double, ptr addrspace(1) %ptr, i64 %tid.ext @@ -126,14 +537,76 @@ define amdgpu_kernel void @mad_sub_f64(ptr addrspace(1) noalias nocapture %out, ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add0: -; GCN-STRICT: v_add_f64 -; GCN-STRICT: v_add_f64 - -; GCN-CONTRACT: 
v_fma_f64 -define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-STRICT-LABEL: fadd_a_a_b_f64_fast_add0: +; SI-STRICT: ; %bb.0: +; SI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; SI-STRICT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-STRICT-NEXT: s_endpgm +; +; SI-CONTRACT-LABEL: fadd_a_a_b_f64_fast_add0: +; SI-CONTRACT: ; %bb.0: +; SI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], 2.0, v[4:5] +; SI-CONTRACT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-CONTRACT-NEXT: s_endpgm +; +; VI-STRICT-LABEL: fadd_a_a_b_f64_fast_add0: +; VI-STRICT: ; %bb.0: +; VI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-STRICT-NEXT: s_waitcnt lgkmcnt(0) 
+; VI-STRICT-NEXT: v_mov_b32_e32 v1, s1 +; VI-STRICT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; VI-STRICT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-STRICT-NEXT: s_endpgm +; +; VI-CONTRACT-LABEL: fadd_a_a_b_f64_fast_add0: +; VI-CONTRACT: ; %bb.0: +; VI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[4:5], 2.0, v[2:3] +; VI-CONTRACT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -148,14 +621,76 @@ define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add1: -; GCN-STRICT: v_add_f64 -; GCN-STRICT: v_add_f64 - -; GCN-CONTRACT: v_fma_f64 -define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define 
amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-STRICT-LABEL: fadd_a_a_b_f64_fast_add1: +; SI-STRICT: ; %bb.0: +; SI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; SI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; SI-STRICT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-STRICT-NEXT: s_endpgm +; +; SI-CONTRACT-LABEL: fadd_a_a_b_f64_fast_add1: +; SI-CONTRACT: ; %bb.0: +; SI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], 2.0, v[4:5] +; SI-CONTRACT-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-CONTRACT-NEXT: s_endpgm +; +; VI-STRICT-LABEL: fadd_a_a_b_f64_fast_add1: +; VI-STRICT: ; %bb.0: +; VI-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-STRICT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; VI-STRICT-NEXT: v_mov_b32_e32 v1, s1 +; VI-STRICT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
VI-STRICT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-STRICT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-STRICT-NEXT: flat_load_dwordx2 v[4:5], v[4:5] glc +; VI-STRICT-NEXT: s_waitcnt vmcnt(0) +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[2:3] +; VI-STRICT-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] +; VI-STRICT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-STRICT-NEXT: s_endpgm +; +; VI-CONTRACT-LABEL: fadd_a_a_b_f64_fast_add1: +; VI-CONTRACT: ; %bb.0: +; VI-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[4:5], v[0:1] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-CONTRACT-NEXT: v_fma_f64 v[2:3], v[4:5], 2.0, v[2:3] +; VI-CONTRACT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -170,11 +705,40 @@ define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast: -; GCN: v_fma_f64 -define amdgpu_kernel void @fadd_a_a_b_f64_fast(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_a_a_b_f64_fast(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-LABEL: fadd_a_a_b_f64_fast: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; 
SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], 2.0, v[4:5] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_a_a_b_f64_fast: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f64 v[2:3], v[4:5], 2.0, v[2:3] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll index 8d44330b1b973..6f6ff96a1a196 100644 --- a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll @@ -1,4 +1,3 @@ -; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 
| FileCheck -check-prefixes=GCN,FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 70ea0688c8a49..d8079651787ad 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -23,9 +23,9 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off @@ -41,8 +41,7 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: s_wait_loadcnt 0x1 ; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v2 :: v_dual_lshlrev_b32 v2, 24, v2 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 ; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off @@ -66,17 +65,15 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[2:3], off -; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: v_pk_add_u16 v10, v6, v2 ; GCN-SDAG-NEXT: v_pk_add_u16 v11, v7, v3 ; 
GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 12 -; GCN-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v6, 8 +; GCN-SDAG-NEXT: v_dual_mov_b32 v2, 12 :: v_dual_mov_b32 v6, 8 +; GCN-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v7, 0 +; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v0 -; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GCN-SDAG-NEXT: v_mov_b32_e32 v7, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_lshrrev_b32 v0, 16, v10 ; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v1 ; GCN-SDAG-NEXT: s_clause 0x2 ; GCN-SDAG-NEXT: global_store_b16 v[2:3], v11, off @@ -90,18 +87,19 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off ; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[2:3], off -; GCN-GISEL-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2 -; GCN-GISEL-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 4 -; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6 -; GCN-GISEL-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8 -; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10 -; GCN-GISEL-NEXT: v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v10, 2 +; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v11, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v12, 4 :: v_dual_mov_b32 v14, 6 +; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v15, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v16, 8 :: v_dual_mov_b32 v18, 10 +; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v19, 0 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: v_pk_add_u16 v2, v6, v2 ; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v20, 12 ; GCN-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; 
GCN-GISEL-NEXT: v_dual_mov_b32 v21, 0 :: v_dual_lshrrev_b32 v0, 16, v2 ; GCN-GISEL-NEXT: v_pk_add_u16 v3, v7, v3 ; GCN-GISEL-NEXT: s_clause 0x6 ; GCN-GISEL-NEXT: global_store_b16 v[8:9], v4, off @@ -111,7 +109,6 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off ; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off ; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off -; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1 %insert = insertelement <7 x i16> %vec1, i16 20, i32 4 @@ -319,14 +316,13 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_mov_b32_e32 v16, 0x70 ; GCN-SDAG-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v50, 0x60 ; GCN-SDAG-NEXT: v_dual_mov_b32 v51, 0 :: v_dual_mov_b32 v52, 48 -; GCN-SDAG-NEXT: v_dual_mov_b32 v38, 0x50 :: v_dual_mov_b32 v53, 0 ; GCN-SDAG-NEXT: v_mov_b32_e32 v54, 32 -; GCN-SDAG-NEXT: v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0 -; GCN-SDAG-NEXT: v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v48, 64 -; GCN-SDAG-NEXT: v_dual_mov_b32 v55, 0 :: v_dual_mov_b32 v40, 16 -; GCN-SDAG-NEXT: v_mov_b32_e32 v49, 0 -; GCN-SDAG-NEXT: v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v43, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v38, 0x50 :: v_dual_mov_b32 v53, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v55, 0 :: v_dual_mov_b32 v14, 0xc8 +; GCN-SDAG-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v39, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v48, 64 :: v_dual_mov_b32 v40, 16 +; GCN-SDAG-NEXT: v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v49, 0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v43, 0 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 ; GCN-SDAG-NEXT: global_store_b128 v[16:17], v[6:9], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 @@ -408,15 +404,16 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr 
addrspace(1) %pt ; GCN-GISEL-NEXT: v_mov_b32_e32 v34, 0xc8 ; GCN-GISEL-NEXT: v_dual_mov_b32 v35, 0 :: v_dual_mov_b32 v38, 0 ; GCN-GISEL-NEXT: v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v48, 16 -; GCN-GISEL-NEXT: v_dual_mov_b32 v49, 0 :: v_dual_mov_b32 v50, 32 -; GCN-GISEL-NEXT: v_dual_mov_b32 v52, 48 :: v_dual_mov_b32 v51, 0 -; GCN-GISEL-NEXT: v_dual_mov_b32 v53, 0 :: v_dual_mov_b32 v54, 64 -; GCN-GISEL-NEXT: v_dual_mov_b32 v40, 0x50 :: v_dual_mov_b32 v55, 0 -; GCN-GISEL-NEXT: v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 0x60 +; GCN-GISEL-NEXT: v_dual_mov_b32 v50, 32 :: v_dual_mov_b32 v49, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v51, 0 :: v_dual_mov_b32 v52, 48 +; GCN-GISEL-NEXT: v_mov_b32_e32 v54, 64 +; GCN-GISEL-NEXT: v_dual_mov_b32 v40, 0x50 :: v_dual_mov_b32 v53, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v55, 0 :: v_dual_mov_b32 v41, 0 +; GCN-GISEL-NEXT: v_mov_b32_e32 v42, 0x60 ; GCN-GISEL-NEXT: v_dual_mov_b32 v44, 0x70 :: v_dual_mov_b32 v43, 0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v45, 0 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x7 -; GCN-GISEL-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v36, v8 +; GCN-GISEL-NEXT: v_dual_mov_b32 v45, 0 :: v_dual_mov_b32 v37, v9 +; GCN-GISEL-NEXT: v_mov_b32_e32 v36, v8 ; GCN-GISEL-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8 ; GCN-GISEL-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9] ; GCN-GISEL-NEXT: s_wait_loadcnt 0x6 @@ -491,12 +488,11 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-SDAG-NEXT: v_mov_b32_e32 v8, 12 ; GCN-SDAG-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 8 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-SDAG-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0 -; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-SDAG-NEXT: v_mov_b32_e32 v12, 
0 ; GCN-SDAG-NEXT: v_mov_b32_e32 v13, 0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -520,17 +516,16 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GCN-GISEL-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 2 -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-GISEL-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v4, 4, v0 -; GCN-GISEL-NEXT: s_wait_xcnt 0x0 -; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-GISEL-NEXT: v_mov_b32_e32 v12, 4 ; GCN-GISEL-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 6 ; GCN-GISEL-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 8 ; GCN-GISEL-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 10 -; GCN-GISEL-NEXT: v_dual_mov_b32 v20, 12 :: v_dual_mov_b32 v19, 0 +; GCN-GISEL-NEXT: v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v20, 12 ; GCN-GISEL-NEXT: v_mov_b32_e32 v21, 0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll index da537e9676ca9..bdec2c8545c7b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll @@ -1,5 +1,3 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll index 234014fac9f5e..79288d76b414a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll @@ -1,5 +1,3 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll new file mode 100644 index 0000000000000..2173d07baa57e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12 + +define amdgpu_ps void @test_asynccnt() { +; GFX12-LABEL: test_asynccnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_asynccnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.asynccnt(i16 0) + ret void +} + +define amdgpu_ps void @test_tensorcnt() { +; GFX12-LABEL: test_tensorcnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_tensorcnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.tensorcnt(i16 0) + ret void +} + +declare void 
@llvm.amdgcn.s.wait.asynccnt(i16) +declare void @llvm.amdgcn.s.wait.tensorcnt(i16) diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 1dd08c561b2ab..8b7102582c2d0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,246 +1,872 @@ -; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s -; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s +; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s ; Testing for ds_read/write_b128 -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga 
-mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-DS128 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-DS128 %s -; FUNC-LABEL: {{^}}local_load_i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_u16 v{{[0-9]+}} - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b16 v1, v0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_load_i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b16 v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 4, @0, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: 
LDS_SHORT_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load i16, ptr addrspace(3) %in store i16 %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v2i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b32 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v2i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_load_v2i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b32 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b32 v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v2i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <2 x i16>, ptr addrspace(3) %in store <2 x i16> %ld, ptr 
addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v3i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 -; GCN-DAG: ds_write_b32 -; GCN-DAG: ds_write_b16 - -; EG-DAG: LDS_USHORT_READ_RET -; EG-DAG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v3i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v2, v0 +; SI-NEXT: ds_write_b16 v2, v1 offset:4 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_load_v3i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b64 v[0:1], v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b16 v2, v1 offset:4 +; VI-NEXT: ds_write_b32 v2, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v3i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v2, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v2, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v3i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.Y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, 
literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in store <3 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v4i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v4i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_load_v4i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b64 v[0:1], v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_load_v4i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_load_v4i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 11, @3, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, 
literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <4 x i16>, ptr addrspace(3) %in store <4 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_load_v8i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v8i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_load_v8i16: +; VI-NO-DS128: ; %bb.0: ; %entry +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_load_v8i16: +; GFX9-NO-DS128: ; %bb.0: ; %entry +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_load_v8i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] +; 
EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_load_v8i16: +; VI-DS128: ; %bb.0: ; %entry +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_load_v8i16: +; GFX9-DS128: ; %bb.0: ; %entry +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-DS128-NEXT: s_endpgm entry: %ld = load <8 x i16>, ptr addrspace(3) %in store <8 x i16> %ld, ptr addrspace(3) %out ret 
void } -; FUNC-LABEL: {{^}}local_load_v16i16: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_load_v16i16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_load_v16i16: +; VI-NO-DS128: ; %bb.0: ; %entry +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_load_v16i16: +; GFX9-NO-DS128: ; %bb.0: ; %entry +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 
offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_load_v16i16: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 53, @5, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: 
ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_load_v16i16: +; VI-DS128: ; %bb.0: ; %entry +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 +; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_load_v16i16: +; GFX9-DS128: ; %bb.0: ; %entry +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] +; GFX9-DS128-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(3) %in store <16 x i16> %ld, ptr addrspace(3) %out ret void } -; FUNC-LABEL: 
{{^}}local_zextload_i16_to_i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_u16 -; GCN: ds_write_b32 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_i16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_zextload_i16_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b32 v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_i16_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_i16_to_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = zext i16 %a to i32 store i32 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_i16_to_i32: -; GCN-NOT: s_wqm_b64 - -; 
GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_i16 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_i16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_sextload_i16_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_i16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b32 v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_i16_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_i16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_i16_to_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 6, @7, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = sext i16 %a to i32 
store i32 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_u16 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v1i16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_zextload_v1i16_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b32 v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v1i16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v1i16_to_v1i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr 
addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_i16 - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v1i16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b32 v1, v0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_sextload_v1i16_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_i16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_write_b32 v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v1i16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_i16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v1i16_to_v1i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: 
LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b32 - -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v2i16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_zextload_v2i16_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b32 v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v2i16_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v2i16_to_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.Y, OQAP, +; EG-NEXT: AND_INT 
T0.W, PV.Y, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b32 - -; EG: LDS_READ_RET -; EG: BFE_INT -; EG: BFE_INT define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v2i16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_sextload_v2i16_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b32 v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v2i16_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX9-NEXT: v_bfe_i32 v0, 
v0, 0, 16 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v2i16_to_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.Y, OQAP, +; EG-NEXT: LSHR * T0.W, PV.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <2 x i16>, ptr addrspace(3) %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 -; SI-DAG: ds_write_b32 -; SI-DAG: ds_write_b64 -; CIVI-DAG: ds_write_b96 -; GFX9-DAG: ds_write_b96 - -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_local_zextload_v3i16_to_v3i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; SI-NEXT: ds_write_b32 v4, v0 offset:8 +; SI-NEXT: ds_write_b64 v4, v[2:3] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_local_zextload_v3i16_to_v3i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: 
s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b64 v[0:1], v0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: ds_write_b96 v3, v[0:2] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_local_zextload_v3i16_to_v3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_local_zextload_v3i16_to_v3i32: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 18, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.Z, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.Y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -248,23 +874,79 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 -; 
SI-DAG: ds_write_b32 -; SI-DAG: ds_write_b64 -; CIVI-DAG: ds_write_b96 -; GFX9-DAG: ds_write_b96 - -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET -; EG: LDS_USHORT_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_local_sextload_v3i16_to_v3i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v1, 0, 16 +; SI-NEXT: ds_write_b32 v4, v0 offset:8 +; SI-NEXT: ds_write_b64 v4, v[2:3] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_local_sextload_v3i16_to_v3i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_b64 v[3:4], v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3 +; VI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; VI-NEXT: v_bfe_i32 v0, v3, 0, 16 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: ds_write_b96 v3, v[0:2] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_local_sextload_v3i16_to_v3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_b64 v[3:4], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v3 +; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GFX9-NEXT: v_bfe_i32 v0, v3, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_local_sextload_v3i16_to_v3i32: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 22, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * 
T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -272,659 +954,7978 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_local_zextload_v4i16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: 
local_local_zextload_v4i16_to_v4i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_local_zextload_v4i16_to_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 22, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_local_zextload_v4i16_to_v4i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32: -; GCN-NOT: s_wqm_b64 -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT 
-; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v4i16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v0 +; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v0, 0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 
+; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v4i16_to_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 25, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T1.Z, PV.Z, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v4i16_to_v4i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b64 v[4:5], v0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v2, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; 
GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b64 v[4:5], v0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v8i16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v12, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: 
v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, s0 +; VI-NO-DS128-NEXT: ds_write2_b64 v2, v[0:1], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v2, v[4:5], v[6:7] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v10, v[0:1], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v10, v[4:5], v[6:7] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v8i16_to_v8i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 46, @16, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, 
KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: AND_INT T1.W, T0.W, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v8i16_to_v8i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; VI-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] offset:16 +; VI-DS128-NEXT: ds_write_b128 v0, v[4:7] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[8:11] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] +; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; 
EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v8i16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v12, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v3, 0, 16 +; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[8:9], v[10:11] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32: +; GFX9-NO-DS128: ; %bb.0: +; 
GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v8i16_to_v8i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 51, @17, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: LSHR * T1.W, T0.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: LSHR T1.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T2.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, 
KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T1.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v8i16_to_v8i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] offset:16 
+; VI-DS128-NEXT: ds_write_b128 v0, v[4:7] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[8:11] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] +; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} - -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 -; GCN: ds_write2_b64 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v16i16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: 
s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; 
VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 
0xffff, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v16i16_to_v16i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 94, @18, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: MOV * T2.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; 
EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: MOV * T3.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: 
ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v16i16_to_v16i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32 +; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 +; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i32: 
+; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] +; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: 
BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v16i16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v3 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v4 +; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v2, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v6 +; SI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v7, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v6, 0, 16 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: 
ds_read2_b64 v[0:3], v4 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v16i16_to_v16i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 95, @19, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: 
LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: MOV * T2.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: LSHR * T3.Z, T2.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T2.W, T2.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T4.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T3.Z, T0.Z, literal.x, +; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T4.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T3.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T4.Z, T1.Z, literal.x, +; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T3.Z, T2.Z, literal.x, +; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, 
+; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T2.Y, 0.0, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ALU 7, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v16i16_to_v16i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 
offset:16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 +; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 +; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 
v1, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] +; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v32i16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: 
v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v32, s0 +; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v32, v[18:19], 
v[16:17] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v17, 16, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, 
v[14:15], v[12:13] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v32i16_to_v32i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 105, @21, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; 
EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: MOV * T4.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: LSHR T5.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T4.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: 
AND_INT T4.W, T4.Y, literal.x, +; EG-NEXT: MOV * T5.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T3.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: ALU 84, @22, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T2.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: 
ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 
104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v32i16_to_v32i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v20, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v20 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 +; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, 
v5 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 +; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 +; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 +; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 +; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 +; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v20 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 
0xffff, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] +; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 +; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; GCN-DAG: ds_read2_b64 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v32i16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1 +; 
SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2 +; SI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v22, v2, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4 +; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7 +; SI-NEXT: v_bfe_i32 v4, v7, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6 +; SI-NEXT: v_bfe_i32 v6, v6, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v9 +; SI-NEXT: v_bfe_i32 v24, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 +; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11 +; SI-NEXT: v_bfe_i32 v26, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 +; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13 +; SI-NEXT: v_bfe_i32 v28, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 +; SI-NEXT: v_bfe_i32 v30, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 +; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 +; SI-NEXT: v_mov_b32_e32 v32, s0 +; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: +; VI-NO-DS128: ; 
%bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 +; VI-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 +; VI-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 
16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 +; VI-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 +; 
GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v32i16_to_v32i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 101, @23, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; 
EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT 
* T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: LSHR * T5.W, T4.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: LSHR T5.Z, T4.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T0.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T1.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T1.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 
0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR * T6.Z, T2.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 89, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T2.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T2.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T3.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T3.Z, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T3.W, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T6.Z, T4.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.Z, T5.Y, literal.x, +; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: BFE_INT T5.W, T4.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: BFE_INT T4.W, T4.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: BFE_INT T4.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: BFE_INT T4.W, T0.Z, 0.0, literal.x, +; EG-NEXT: MOV * T5.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T4.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T4.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T4.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) +; EG-NEXT: 
LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) +; EG-NEXT: ALU 16, @25, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v32i16_to_v32i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v24, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v24 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 +; VI-DS128-NEXT: s_waitcnt 
lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 +; VI-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 +; VI-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 +; VI-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 +; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] +; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v24 +; 
GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[31:34] 
offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] +; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 +; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13 -; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} 
offset0:18 offset1:19 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v64i16_to_v64i32: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0xe8f000 +; SI-NEXT: s_add_u32 s12, s12, s11 +; SI-NEXT: 
s_addc_u32 s13, s13, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9 +; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:10 offset1:11 +; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13 +; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1 +; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[34:37], v24 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[38:41], v24 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; SI-NEXT: s_waitcnt lgkmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_and_b32_e32 v42, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v45, 
16, v9 +; SI-NEXT: v_and_b32_e32 v44, 0xffff, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_and_b32_e32 v46, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_and_b32_e32 v52, 0xffff, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; SI-NEXT: v_and_b32_e32 v54, 0xffff, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v35 +; SI-NEXT: v_and_b32_e32 v56, 0xffff, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v37 +; SI-NEXT: v_and_b32_e32 v58, 0xffff, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v39 +; SI-NEXT: v_and_b32_e32 v60, 0xffff, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 +; SI-NEXT: v_and_b32_e32 v62, 0xffff, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 +; SI-NEXT: 
ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1 +; SI-NEXT: ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31 +; SI-NEXT: ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; VI-NO-DS128-NEXT: 
buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 +; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v22 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; VI-NO-DS128-NEXT: v_and_b32_e32 v29, 0xffff, v24 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v23 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v18 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; 
VI-NO-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v19 +; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:10 offset1:11 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v22 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; VI-NO-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v24 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v23 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v18 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 +; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; VI-NO-DS128-NEXT: v_and_b32_e32 v59, 0xffff, v24 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v62, 16, v17 +; VI-NO-DS128-NEXT: 
v_and_b32_e32 v61, 0xffff, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[0:1], v[19:20] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[16:17], v[61:62] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[23:24], v[59:60] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[21:22], v[57:58] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[55:56], v[53:54] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[51:52], v[49:50] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[47:48], v[45:46] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[43:44], v[41:42] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[39:40], v[37:38] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[35:36], v[33:34] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[31:32], v[29:30] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[27:28], v[25:26] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[14:15], v[12:13] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[10:11], v[8:9] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[6:7], v[4:5] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload +; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[2:3], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1 +; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 +; 
GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 +; GFX9-NO-DS128-NEXT: 
v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v20 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v18 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v20 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 +; 
GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] 
offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX9-NO-DS128-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v64i16_to_v64i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 116, @26, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 84(1.177091e-43), 
0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Z, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.W, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Y, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Z, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.W, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: 
MOV T7.Y, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Z, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.W, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Y, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Z, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.W, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Y, OQAP, +; EG-NEXT: MOV * T9.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Z, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: ALU 95, @27, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.W, OQAP, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T10.W +; EG-NEXT: MOV T10.Y, OQAP, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T10.W +; EG-NEXT: MOV T10.Z, OQAP, +; EG-NEXT: LSHR T10.W, T10.Y, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: AND_INT T10.W, T10.Y, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T10.W, 
T10.Z, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: AND_INT T10.W, T10.Z, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T10.W, T9.W, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: AND_INT T9.W, T9.W, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: LSHR T9.W, T9.Z, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: AND_INT T9.W, T9.Z, literal.x, +; EG-NEXT: MOV * T10.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: LSHR T9.W, T9.Y, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: AND_INT T9.W, T9.Y, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: LSHR T9.W, T8.W, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: AND_INT T8.W, T8.W, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: LSHR T8.W, T8.Z, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: AND_INT T8.W, T8.Z, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, 
+; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: LSHR T8.W, T8.Y, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: AND_INT T8.W, T8.Y, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: LSHR T8.W, T7.W, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T9.W, T8.W, +; EG-NEXT: AND_INT T7.W, T7.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: LSHR T7.W, T7.Z, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: AND_INT T7.W, T7.Z, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: LSHR T7.W, T7.Y, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: AND_INT * T7.W, T7.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: ALU 93, @28, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.x, +; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: LSHR T7.W, T6.W, literal.x, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T8.W, T7.W, +; EG-NEXT: AND_INT T6.W, T6.W, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: LSHR T6.W, T6.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, 
KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: AND_INT T6.W, T6.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: LSHR T6.W, T6.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: AND_INT T6.W, T6.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: LSHR T6.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: AND_INT T5.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T5.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T5.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: LSHR T5.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T4.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 
144(2.017870e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T3.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43) +; EG-NEXT: ALU 76, @29, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43) +; 
EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T2.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 200(2.802597e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, 
T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v64i16_to_v64i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-DS128-NEXT: s_mov_b32 s90, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[8:11], v0 +; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 +; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 +; VI-DS128-NEXT: s_add_u32 s88, s88, s11 +; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: buffer_store_dword v1, off, 
s[88:91], 0 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; VI-DS128-NEXT: 
ds_read_b128 v[36:39], v0 offset:64 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 +; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 +; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 +; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-DS128-NEXT: v_mov_b32_e32 v31, v15 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; VI-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27 +; VI-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26 +; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 +; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 +; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 +; VI-DS128-NEXT: 
v_lshrrev_b32_e32 v11, 16, v56 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; VI-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58 +; VI-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56 +; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224 +; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240 +; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192 +; VI-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208 +; VI-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160 +; VI-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 +; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 +; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 +; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 +; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 +; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload +; VI-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 +; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; VI-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 +; 
VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload +; VI-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] +; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DS128-NEXT: s_mov_b32 s14, -1 +; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 +; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 +; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; 
GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 +; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 +; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 
offset:80 +; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX9-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27 +; GFX9-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26 +; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 +; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 +; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; GFX9-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58 +; GFX9-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57 +; GFX9-DS128-NEXT: v_and_b32_e32 
v10, 0xffff, v56 +; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 +; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 +; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 +; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte 
Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] +; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v64i16_to_v64i32: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0xe8f000 +; SI-NEXT: s_add_u32 s12, s12, s11 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:8 offset1:9 +; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:10 offset1:11 +; SI-NEXT: ds_read2_b64 
v[8:11], v20 offset0:12 offset1:13 +; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1 +; SI-NEXT: ds_read2_b64 v[30:33], v20 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[34:37], v20 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[38:41], v20 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(7) +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt lgkmcnt(6) +; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v1 +; SI-NEXT: v_bfe_i32 v20, v5, 0, 16 +; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_i32 v22, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v24, v7, 0, 16 +; SI-NEXT: v_bfe_i32 v26, v6, 0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v3 +; SI-NEXT: v_bfe_i32 v28, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2 +; SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(5) +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; SI-NEXT: v_bfe_i32 v2, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 +; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v11 +; SI-NEXT: v_bfe_i32 v42, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 +; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v13 +; SI-NEXT: v_bfe_i32 v44, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v15 +; SI-NEXT: v_bfe_i32 v46, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 +; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v17 +; 
SI-NEXT: v_bfe_i32 v48, v17, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16 +; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v19 +; SI-NEXT: v_bfe_i32 v50, v19, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18 +; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v31 +; SI-NEXT: v_bfe_i32 v52, v31, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v30 +; SI-NEXT: v_bfe_i32 v30, v30, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33 +; SI-NEXT: v_bfe_i32 v54, v33, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32 +; SI-NEXT: v_bfe_i32 v32, v32, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35 +; SI-NEXT: v_bfe_i32 v56, v35, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34 +; SI-NEXT: v_bfe_i32 v34, v34, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37 +; SI-NEXT: v_bfe_i32 v58, v37, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36 +; SI-NEXT: v_bfe_i32 v36, v36, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39 +; SI-NEXT: v_bfe_i32 v60, v39, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38 +; SI-NEXT: v_bfe_i32 v38, v38, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41 +; SI-NEXT: v_bfe_i32 v62, v41, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40 +; SI-NEXT: v_bfe_i32 v40, v40, 0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 +; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v0, v[16:17], v[48:49] offset1:1 +; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 
offset1:31 +; SI-NEXT: ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, 
v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 +; VI-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32 +; VI-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 +; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 +; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 +; VI-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36 +; VI-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35 +; VI-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30 +; VI-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29 +; VI-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 +; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 +; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 +; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 +; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 
16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22 +; VI-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34 +; VI-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33 +; VI-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36 +; VI-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35 +; VI-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29 +; VI-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28 +; VI-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], 
v[41:42] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload +; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1 +; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; 
GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 +; 
GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] 
offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX9-NO-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v64i16_to_v64i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 116, @30, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; 
EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, 
literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Z, OQAP, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.W, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Y, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.Z, OQAP, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T6.W +; EG-NEXT: MOV T6.W, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Y, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.Z, OQAP, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x, +; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T7.W +; EG-NEXT: MOV T7.W, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Y, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.Z, OQAP, +; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T8.W +; EG-NEXT: MOV T8.W, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Y, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; 
EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.Z, OQAP, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: ALU 85, @31, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_READ_RET * OQAP, T9.W +; EG-NEXT: MOV T9.W, OQAP, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T10.W +; EG-NEXT: MOV T10.Y, OQAP, +; EG-NEXT: LSHR T10.W, T9.W, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LDS_READ_RET * OQAP, T11.W +; EG-NEXT: MOV T10.Z, OQAP, +; EG-NEXT: LSHR * T11.Z, T10.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T10.W, T10.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T0.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T0.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T1.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T1.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; 
EG-NEXT: 16(2.242078e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T1.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T2.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T2.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T2.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T3.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T3.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T3.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T4.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T4.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, 
T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43) +; EG-NEXT: ALU 83, @32, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T4.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T5.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T5.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T5.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T6.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T6.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T6.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T7.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; 
EG-NEXT: 16(2.242078e-44), 172(2.410233e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T7.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T7.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T8.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T8.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T8.W, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T9.Y, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T12.Z, T9.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: LSHR T11.Z, T10.Z, literal.x, +; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43) +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: BFE_INT 
T10.W, T11.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43) +; EG-NEXT: ALU 94, @33, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T11.W, T10.W, +; EG-NEXT: BFE_INT T9.W, T9.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T9.W, T10.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T9.W, T0.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T9.W, T0.Z, 0.0, literal.x, +; EG-NEXT: MOV * T10.W, KC0[2].Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T10.W, T9.W, +; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T9.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T9.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T9.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T1.W, 
T0.W, +; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T4.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T5.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T6.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; 
EG-NEXT: BFE_INT T0.W, T6.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T6.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 34, @34, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T7.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T7.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T8.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T8.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T8.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T9.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: BFE_INT T0.W, T10.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 
+; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-DS128-NEXT: s_mov_b32 s90, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 +; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 +; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 +; VI-DS128-NEXT: s_add_u32 s88, s88, s11 +; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 +; VI-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 +; VI-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 +; 
VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 +; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 +; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 +; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 +; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 +; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 +; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 +; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 +; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 +; VI-DS128-NEXT: v_mov_b32_e32 v23, v15 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 +; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 +; VI-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 +; VI-DS128-NEXT: v_bfe_i32 
v24, v35, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36 +; VI-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41 +; VI-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56 +; VI-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39 +; VI-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224 +; VI-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240 +; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192 +; VI-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208 +; VI-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160 +; VI-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176 +; VI-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128 +; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 +; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 +; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 +; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 +; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; VI-DS128-NEXT: 
buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; VI-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 +; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload +; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload +; VI-DS128-NEXT: s_waitcnt vmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] +; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DS128-NEXT: s_mov_b32 s14, -1 +; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 +; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 +; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded 
Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 +; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 +; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 +; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 +; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 +; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 +; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36 +; GFX9-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41 +; GFX9-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56 +; GFX9-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39 +; GFX9-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224 +; GFX9-DS128-NEXT: 
ds_write_b128 v32, v[40:43] offset:240 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 +; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 +; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] +; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 +; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_i16_to_i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - -; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]], -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} - -; 
GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: LDS_WRITE define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_i16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_zextload_i16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_i16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_i16_to_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @35, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 
0(0.000000e+00), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = zext i16 %a to i64 store i64 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_i16_to_i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - ; FIXME: Need to optimize this sequence to avoid an extra shift. ; t25: i32,ch = load t12, t10, undef:i32 ; t28: i64 = any_extend t25 ; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16 -; SI: ds_read_i16 v[[LO:[0-9]+]], -; GFX89: ds_read_u16 v[[ULO:[0-9]+]] -; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 -; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] - -; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG-DAG: LDS_WRITE -; EG-DAG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_i16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_sextload_i16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: 
s_endpgm +; +; GFX9-LABEL: local_sextload_i16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_i16_to_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @36, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %a = load i16, ptr addrspace(3) %in %ext = sext i16 %a to i64 store i64 %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: LDS_WRITE define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v1i16_to_v1i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_u16 v0, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_zextload_v1i16_to_v1i64: +; VI: ; 
%bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_zextload_v1i16_to_v1i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v1i16_to_v1i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @37, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z -; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] -; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP -; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y -; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal -; EG-DAG: LDS_WRITE -; EG-DAG: 16 -; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: 
local_sextload_v1i16_to_v1i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_i16 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write_b64 v2, v[0:1] +; SI-NEXT: s_endpgm +; +; VI-LABEL: local_sextload_v1i16_to_v1i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: ds_read_u16 v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: ds_write_b64 v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: local_sextload_v1i16_to_v1i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: ds_write_b64 v2, v[0:1] +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v1i16_to_v1i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @38, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.X, OQAP, +; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN %load = load <1 x i16>, ptr addrspace(3) %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: 
{{^}}local_zextload_v2i16_to_v2i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v2i16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read_b32 v0, v0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read_b32 v0, v0 +; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v2i16_to_v2i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 17, @39, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.Y, OQAP, +; EG-NEXT: AND_INT T0.W, PV.Y, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v2i16_to_v2i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b32 v0, v0 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v2i16_to_v2i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b32 v2, v0 +; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff +; 
GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(3) %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v2i16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read_b32 v0, v0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: 
s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read_b32 v0, v0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v2i16_to_v2i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 18, @40, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.Y, OQAP, +; EG-NEXT: BFE_INT * T0.W, PV.Y, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T1.W, PV.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v2i16_to_v2i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b32 v1, v0 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v2i16_to_v2i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b32 v1, v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(3) %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v4i16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v3, 0 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v10, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v10, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: 
ds_write2_b64 v10, v[6:7], v[8:9] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, 0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v2 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v2 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v2 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v9, v[7:8], v[5:6] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset0:2 offset1:3 +; 
GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v4i16_to_v4i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 35, @41, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v4i16_to_v4i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v5, 
v1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b64 v[7:8], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v9, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v1 +; VI-DS128-NEXT: ds_write_b128 v9, v[0:3] offset:16 +; VI-DS128-NEXT: ds_write_b128 v9, v[4:7] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v4i16_to_v4i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b64 v[6:7], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] +; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 
{ +; SI-LABEL: local_sextload_v4i16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v4i16_to_v4i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 39, @42, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: BFE_INT * T0.W, T0.Y, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T1.Z, PV.Z, 0.0, literal.x, +; EG-NEXT: ASHR T1.W, PV.W, literal.y, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 20(2.802597e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 
8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.Z, +; EG-NEXT: ASHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v4i16_to_v4i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 +; 
GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v8i16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v11, v5 +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; SI-NEXT: 
v_mov_b32_e32 v0, s0 +; SI-NEXT: ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v0, v[14:15], v[18:19] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, 0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v3 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[2:3] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[8:9] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[1:2] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, v3 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v12 +; 
GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, v12 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[0:1], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v8i16_to_v8i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 71, @43, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: MOV * 
T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: AND_INT T1.W, T0.W, literal.x, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 
0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v8i16_to_v8i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v1 +; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 +; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v1 +; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] +; 
VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff +; GFX9-DS128-NEXT: v_mov_b32_e32 v14, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX9-DS128-NEXT: v_and_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; GFX9-DS128-NEXT: v_and_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v14, v[4:7] +; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; 
EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v8i16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v16, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: ds_write2_b64 v16, v[10:11], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v16, v[8:9], v[4:5] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v16, v[2:3], v[14:15] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v16, v[0:1], v[12:13] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 
16, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v10, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: 
v_bfe_i32 v10, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v8i16_to_v8i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 80, @44, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.W, OQAP, +; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: BFE_INT T1.Z, T0.W, 0.0, literal.x, +; EG-NEXT: ASHR T2.W, T1.W, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 
31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: BFE_INT T2.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T2.W, T1.Z, literal.y, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: BFE_INT T3.Z, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T2.W, T2.Z, literal.y, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: ASHR T2.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 52(7.286752e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: MOV * T2.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T1.Z, +; EG-NEXT: ASHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T2.Z, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; 
EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T3.Z, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v8i16_to_v8i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v16, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-DS128-NEXT: 
ds_write_b128 v16, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:32 +; VI-DS128-NEXT: ds_write_b128 v16, v[8:11] offset:16 +; VI-DS128-NEXT: ds_write_b128 v16, v[4:7] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v16, v[8:11] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v16, v[4:7] +; GFX9-DS128-NEXT: s_endpgm %load = load <8 x i16>, ptr addrspace(3) %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64: -; 
GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v16i16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: v_mov_b32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v13, v9 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_mov_b32_e32 v17, v9 +; SI-NEXT: v_mov_b32_e32 v20, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11 +; SI-NEXT: v_mov_b32_e32 v16, v9 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 +; SI-NEXT: ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v19, v9 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: v_mov_b32_e32 v15, v9 
+; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v20, v[7:8], v[3:4] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[9:10] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 
v14, v[12:13], v[7:8] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[3:4], v[9:10] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[2:3], v[6:7] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[1:2], v[5:6] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[11:12] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[11:12], v[9:10] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 
v9, 16, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v4 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[7:8] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[9:10] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[2:3], v[6:7] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[1:2], v[5:6] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[0:1], v[13:14] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v16i16_to_v16i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 100, @45, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 
0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: MOV * T2.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: MOV * T3.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 
48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: ALU 42, @46, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 
20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v16i16_to_v16i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: v_mov_b32_e32 v26, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v22, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v24, v26 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-DS128-NEXT: v_mov_b32_e32 v5, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v5 +; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v19, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v26 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14 +; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v26 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v21, v26 +; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v26 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112 +; VI-DS128-NEXT: v_mov_b32_e32 v16, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v18, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v1, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v28, v26 +; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v26 +; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96 +; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v14, v[25:28] offset:80 +; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] +; VI-DS128-NEXT: 
s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: v_mov_b32_e32 v25, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v25 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v28, s0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v25 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112 +; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v25 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96 +; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v25 +; GFX9-DS128-NEXT: 
v_mov_b32_e32 v27, v25 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v25 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v28, v[8:11] +; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v16i16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: v_mov_b32_e32 v18, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 
v10, 16, v3 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; SI-NEXT: v_bfe_i32 v12, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; SI-NEXT: v_bfe_i32 v12, v14, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v7, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 +; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 +; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], 
v4 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 +; VI-NO-DS128-NEXT: 
v_bfe_i32 v12, v11, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: 
v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 +; 
GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v16i16_to_v16i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 101, @47, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: MOV * T1.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: BFE_INT T2.W, T1.W, 0.0, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV * T2.Z, OQAP, +; EG-NEXT: BFE_INT T3.Z, T2.Y, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T2.W, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T4.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T3.Z, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 
31(4.344025e-44) +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T5.Z, T0.Z, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T4.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T6.Z, T0.W, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T5.Z, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T6.Z, literal.y, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T7.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: BFE_INT T9.Z, T2.Z, 0.0, literal.x, +; EG-NEXT: ASHR T3.W, T8.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: ASHR T3.W, T9.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 116(1.625506e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: ASHR T3.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: ASHR T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T3.W, 
T1.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T2.W, +; EG-NEXT: ASHR T1.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T3.Z, +; EG-NEXT: ASHR T1.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) +; EG-NEXT: ALU 62, @48, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T4.Z, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T1.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T5.Z, +; EG-NEXT: ASHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: ASHR T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: 
LDS_WRITE * T0.W, T6.Z, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T7.Z, +; EG-NEXT: ASHR T0.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T8.Z, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T9.Z, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v16i16_to_v16i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[3:6], v0 +; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_mov_b32_e32 v18, v6 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 +; VI-DS128-NEXT: 
v_lshrrev_b32_e32 v8, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 +; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v15, v10 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 +; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 +; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 +; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; 
VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 +; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0 +; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6 +; 
GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_zextload_v32i16_to_v32i64: +; SI: ; %bb.0: +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[2:5], v0 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: ds_read2_b64 v[6:9], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v22, s0 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[14:17], v0 offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v17 +; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31 +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: 
v_and_b32_e32 v17, 0xffff, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; SI-NEXT: ds_write2_b64 v22, v[6:7], v[4:5] offset1:1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 
+; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 +; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset1:1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; 
VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NO-DS128-NEXT: 
v_mov_b32_e32 v19, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] 
offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-NO-DS128-NEXT: 
v_mov_b32_e32 v1, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_zextload_v32i16_to_v32i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 105, @49, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.W, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T2.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.Z, 
OQAP, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T3.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T4.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: MOV * T5.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: LSHR T5.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T6.W, T5.W, +; EG-NEXT: AND_INT T4.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T5.Y, literal.x, +; EG-NEXT: MOV * T5.W, KC0[2].Y, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T4.W, T4.Y, literal.x, +; 
EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: LSHR T4.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43) +; EG-NEXT: LDS_WRITE * T5.W, T4.W, +; EG-NEXT: AND_INT T3.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: ALU 93, @50, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T3.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: LSHR T3.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T4.W, T3.W, +; EG-NEXT: AND_INT T2.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * 
T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T2.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: LSHR T2.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T3.W, T2.W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T1.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: LSHR T1.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43) +; EG-NEXT: LDS_WRITE * T2.W, T1.W, +; EG-NEXT: AND_INT T0.W, T0.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 
240(3.363116e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: LSHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.W, literal.y, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: ALU 87, @51, KC0[CB0:0-32], KC1[] +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 
68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 156(2.186026e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 140(1.961818e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 188(2.634441e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 172(2.410233e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 220(3.082857e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 204(2.858649e-43), 
0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 252(3.531272e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 236(3.307064e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 228(3.194960e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T1.W, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_zextload_v32i16_to_v32i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v1, s1 +; VI-DS128-NEXT: ds_read_b128 v[3:6], v1 +; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v52, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; VI-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v36, 
0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; VI-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v31, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v49, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v51, v31 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 +; VI-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 +; VI-DS128-NEXT: v_mov_b32_e32 v46, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v48, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v27, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v29, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 +; VI-DS128-NEXT: v_mov_b32_e32 v43, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v45, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 +; VI-DS128-NEXT: v_mov_b32_e32 v24, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v26, v31 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 +; VI-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 +; VI-DS128-NEXT: v_mov_b32_e32 v40, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v42, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 +; VI-DS128-NEXT: v_mov_b32_e32 v21, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v23, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 +; VI-DS128-NEXT: v_mov_b32_e32 v37, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v39, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v18, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v20, v31 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v31 +; 
VI-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 +; VI-DS128-NEXT: v_mov_b32_e32 v34, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v36, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 +; VI-DS128-NEXT: v_mov_b32_e32 v15, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v17, v31 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; VI-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 +; VI-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v33, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 +; VI-DS128-NEXT: v_mov_b32_e32 v12, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v14, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v1, v31 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v31 +; VI-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 +; VI-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 +; VI-DS128-NEXT: ds_write_b128 v52, v[11:14] +; VI-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v1 +; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v52, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 
+; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v49, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v51, v31 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 +; GFX9-DS128-NEXT: v_mov_b32_e32 v46, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v48, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 +; GFX9-DS128-NEXT: v_mov_b32_e32 v43, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v45, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v26, v31 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 +; GFX9-DS128-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v42, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 +; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v31 +; 
GFX9-DS128-NEXT: v_mov_b32_e32 v23, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 +; GFX9-DS128-NEXT: v_mov_b32_e32 v37, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v39, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v31 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 +; GFX9-DS128-NEXT: v_mov_b32_e32 v34, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v33, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 +; GFX9-DS128-NEXT: ds_write_b128 v52, v[11:14] +; GFX9-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 +; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(3) %out ret void } -; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64: -; GFX9-NOT: m0 -; SICIVI: s_mov_b32 m0 - - -; EG: LDS_READ_RET -; EG: 
LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: BFE_INT -; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { +; SI-LABEL: local_sextload_v32i16_to_v32i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_mov_b32_e32 v18, v7 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 +; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5 +; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11 +; SI-NEXT: s_waitcnt lgkmcnt(4) +; 
SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3 +; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1 +; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3 +; SI-NEXT: s_waitcnt lgkmcnt(5) +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11 +; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9 +; SI-NEXT: v_bfe_i32 v18, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27 +; SI-NEXT: s_waitcnt lgkmcnt(6) +; SI-NEXT: v_mov_b32_e32 v1, v15 +; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v15 +; SI-NEXT: v_bfe_i32 v17, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23 +; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 +; SI-NEXT: v_bfe_i32 v17, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_bfe_i32 v3, v4, 
0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_bfe_i32 v1, v12, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v14, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v8, v10, 0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_bfe_i32 v9, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v11, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 +; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 +; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 +; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; SI-NEXT: ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1 +; 
VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(6) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v15, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; 
VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v14, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 +; 
VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 
offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, 
v[3:4], v[16:17] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, 
v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_sextload_v32i16_to_v32i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 107, @52, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 
12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T1.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T1.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T1.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T2.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T2.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T2.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T3.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T3.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T3.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T4.Y, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T4.Z, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, 
T0.W +; EG-NEXT: MOV T4.W, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T5.Y, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T5.Z, OQAP, +; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x, +; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_READ_RET * OQAP, T5.W +; EG-NEXT: MOV * T5.W, OQAP, +; EG-NEXT: BFE_INT T0.Z, T5.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T0.W, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T6.Z, T0.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T0.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T6.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T7.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T9.Z, T1.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T8.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T10.Z, T2.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T9.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; 
EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T11.Z, T2.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T10.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT * T12.Z, T2.W, 0.0, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 98, @53, KC0[CB0:0-32], KC1[] +; EG-NEXT: ASHR T6.W, T11.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 100(1.401298e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T13.Z, T3.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T12.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T14.Z, T3.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T13.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T15.Z, T3.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T14.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T16.Z, T4.Y, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T15.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T17.Z, T4.Z, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T16.Z, literal.y, BS:VEC_120/SCL_212 +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 
16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T18.Z, T4.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T17.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: BFE_INT T19.Z, T5.W, 0.0, literal.x, +; EG-NEXT: ASHR T6.W, T18.Z, literal.y, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ASHR T6.W, T19.Z, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 228(3.194960e-43) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ASHR T6.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ASHR T6.W, T5.Y, literal.x, +; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) +; EG-NEXT: LDS_WRITE * T7.W, T6.W, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T5.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.Z, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T0.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 
16(2.242078e-44), 56(7.847271e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T6.Z, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.Y, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T7.Z, +; EG-NEXT: ASHR T0.W, T1.Z, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR * T0.W, T1.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 99, @54, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x, +; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T8.Z, +; EG-NEXT: ASHR T0.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43) +; EG-NEXT: LDS_WRITE * T6.W, T0.W, +; EG-NEXT: ASHR T0.W, T1.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T9.Z, +; EG-NEXT: ASHR T0.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 
16(2.242078e-44), 120(1.681558e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T10.Z, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T11.Z, +; EG-NEXT: ASHR T0.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 156(2.186026e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T2.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T12.Z, +; EG-NEXT: ASHR T0.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 140(1.961818e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T3.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T13.Z, +; EG-NEXT: ASHR T0.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 188(2.634441e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T3.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: 
ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T14.Z, +; EG-NEXT: ASHR T0.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 172(2.410233e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T3.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T15.Z, +; EG-NEXT: ASHR T0.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 220(3.082857e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T4.Y, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T16.Z, +; EG-NEXT: ASHR T0.W, T4.Z, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 204(2.858649e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR * T0.W, T4.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU 27, @55, KC0[CB0:0-32], KC1[] +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 200(2.802597e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T17.Z, +; EG-NEXT: ASHR T0.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 252(3.531272e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T4.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; 
EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T18.Z, +; EG-NEXT: ASHR T0.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 31(4.344025e-44), 236(3.307064e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ASHR T0.W, T5.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43) +; EG-NEXT: LDS_WRITE * T1.W, T0.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T19.Z, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_sextload_v32i16_to_v32i64: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: v_mov_b32_e32 v2, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224 +; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240 +; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; 
VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(5) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v12 +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176 +; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v20 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; 
VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96 +; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112 +; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64 +; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80 +; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 +; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48 +; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: 
s_endpgm +; +; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v13 +; GFX9-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-DS128-NEXT: 
v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; 
GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9] +; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16 +; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, ptr addrspace(3) %out @@ -948,19 +8949,95 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; } ; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load. 
-; FUNC-LABEL: {{^}}local_v8i16_to_128: - -; SI-NOT: ds_read_b128 -; SI-NOT: ds_write_b128 - -; CIVI: ds_read_b128 -; CIVI: ds_write_b128 - -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET -; EG: LDS_READ_RET define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) { +; SI-LABEL: local_v8i16_to_128: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; SI-NEXT: s_endpgm +; +; VI-NO-DS128-LABEL: local_v8i16_to_128: +; VI-NO-DS128: ; %bb.0: +; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: s_endpgm +; +; GFX9-NO-DS128-LABEL: local_v8i16_to_128: +; GFX9-NO-DS128: ; %bb.0: +; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: s_endpgm +; +; EG-LABEL: local_v8i16_to_128: +; EG: ; %bb.0: +; EG-NEXT: ALU 25, @56, KC0[CB0:0-32], KC1[] +; EG-NEXT: MOV * T0.W, KC0[2].Z, +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: MOV * T0.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_READ_RET * OQAP, T0.W +; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LDS_WRITE * T0.W, T0.X, +; EG-NEXT: RETURN +; +; VI-DS128-LABEL: local_v8i16_to_128: +; VI-DS128: ; %bb.0: +; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] +; VI-DS128-NEXT: s_endpgm +; +; GFX9-DS128-LABEL: local_v8i16_to_128: +; GFX9-DS128: ; %bb.0: +; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-DS128-NEXT: s_endpgm %ld = load <8 x i16>, ptr addrspace(3) %in, align 16 store <8 x i16> %ld, ptr addrspace(3) %out, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index be020457ce87d..9cc42ac448067 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll 
@@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 ; CHECK-NEXT: s_movk_i32 s4, 0xf800 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop @@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB8_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 +; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 ; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 ; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop @@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024 ; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020 ; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016 -; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1 ; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0 +; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1 ; UNROLL3-NEXT: s_waitcnt vmcnt(3) ; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index 79f15123f2b26..29607681634ff 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -1,5 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI 
-check-prefix=GCN -check-prefix=GCN-AA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,CI %s ; This test is mostly to test DAG store merging, so disable the vectorizer. ; Run with devices with different unaligned load restrictions. @@ -8,11 +9,16 @@ ; TODO: Non-zero base offset for load and store combinations ; TODO: Same base addrspacecasted - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: -; GCN: buffer_store_short -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7bc8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1 store i8 123, ptr addrspace(1) %out.gep.1 @@ -20,11 +26,18 @@ define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %o ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i8_natural_align: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN-NEXT: v_mov_b32_e32 v1, 0xc8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1 store i8 123, ptr addrspace(1) %out.gep.1 @@ -32,9 +45,16 @@ define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr a ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: -; GCN: buffer_store_dword v define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7b01c8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 store i16 123, ptr addrspace(1) %out.gep.1 @@ -42,9 +62,16 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: -; GCN: buffer_store_dword v define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_0_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 store i16 0, ptr addrspace(1) %out.gep.1 @@ -52,11 +79,18 @@ define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) ret void } -; 
GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i16_natural_align: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN-NEXT: v_mov_b32_e32 v1, 0x1c8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 store i16 123, ptr addrspace(1) %out.gep.1 @@ -64,11 +98,17 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b -; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x1c8 +; GCN-NEXT: v_mov_b32_e32 v1, 0x7b +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 store i32 123, ptr addrspace(1) %out.gep.1 @@ -76,33 +116,53 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: -; GCN: buffer_store_dwordx2 define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i32_f32: +; 
GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x1c8 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 store float 1.0, ptr addrspace(1) %out.gep.1 store i32 456, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b -; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_f32_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x7b +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 store i32 123, ptr addrspace(1) %out.gep.1 store float 4.0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]] define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_4_constants_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x4d2 +; GCN-NEXT: v_mov_b32_e32 v1, 0x7b +; GCN-NEXT: v_mov_b32_e32 v2, 0x1c8 +; GCN-NEXT: v_mov_b32_e32 v3, 
0x14d +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 @@ -114,9 +174,19 @@ define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: -; GCN: buffer_store_dwordx4 define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_4_constants_f32_order: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 4.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 @@ -129,9 +199,19 @@ define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspac } ; First store is out of order. 
-; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: -; GCN: buffer_store_dwordx4 define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_4_constants_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 4.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 @@ -143,10 +223,19 @@ define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32: -; GCN-AA: buffer_store_dwordx4 v -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_4_constants_mixed_i32_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GCN-NEXT: v_mov_b32_e32 v1, 11 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 17 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 @@ -159,13 +248,32 @@ define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addr ret void } -; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword 
v -; CI-DAG: buffer_store_dwordx3 -; GCN-NOT: buffer_store_dword -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 { +; SI-LABEL: merge_global_store_3_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, 0x1c8 +; SI-NEXT: v_mov_b32_e32 v0, 0x4d2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, 0x7b +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_3_constants_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v2, 0x1c8 +; CI-NEXT: v_mov_b32_e32 v0, 0x4d2 +; CI-NEXT: v_mov_b32_e32 v1, 0x7b +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; CI-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 @@ -175,9 +283,19 @@ define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; GCN: buffer_store_dwordx4 define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 { +; GCN-LABEL: merge_global_store_2_constants_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x1c8 +; GCN-NEXT: v_mov_b32_e32 v2, 0x7b +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1 store i64 123, ptr addrspace(1) %out.gep.1 @@ 
-185,10 +303,40 @@ define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 { +; SI-LABEL: merge_global_store_4_constants_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1c8 +; SI-NEXT: v_mov_b32_e32 v2, 0x14d +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x4d2 +; SI-NEXT: v_mov_b32_e32 v2, 0x7b +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_4_constants_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 0x1c8 +; CI-NEXT: v_mov_b32_e32 v2, 0x14d +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v0, 0x4d2 +; CI-NEXT: v_mov_b32_e32 v2, 0x7b +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1 %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2 %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3 @@ -200,10 +348,23 @@ define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[LOAD]] define amdgpu_kernel void 
@merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_2_adjacent_loads_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 @@ -215,10 +376,21 @@ define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace ret void } -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_2_adjacent_loads_i32_nonzero_base: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:8 +; GCN-NEXT: s_mov_b32 s2, s6 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:8 +; GCN-NEXT: s_endpgm %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3 @@ -232,10 +404,24 @@ define amdgpu_kernel void 
@merge_global_store_2_adjacent_loads_i32_nonzero_base( ret void } -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dwordx2 v -; GCN: buffer_store_dwordx2 v define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_2_adjacent_loads_shuffle_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 @@ -247,10 +433,23 @@ define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr a ret void } -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_adjacent_loads_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: 
s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 @@ -270,16 +469,43 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace ret void } -; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword -; CI-DAG: buffer_load_dwordx3 -; GCN: s_waitcnt -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword v -; CI-DAG: buffer_store_dwordx3 -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: merge_global_store_3_adjacent_loads_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_3_adjacent_loads_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CI-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr 
addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 @@ -295,10 +521,23 @@ define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace ret void } -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_adjacent_loads_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 @@ -318,10 +557,21 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace ret void } -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_adjacent_loads_i32_nonzero_base: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:44 +; GCN-NEXT: s_mov_b32 s2, s6 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:28 +; GCN-NEXT: s_endpgm %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13 @@ -343,11 +593,24 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base( ret void } -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: s_barrier -; GCN: buffer_store_dwordx4 [[LOAD]] define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_adjacent_loads_inverse_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_barrier +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 @@ -373,12 +636,49 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr a ; TODO: Re-packing of loaded register required. Maybe an IR pass ; should catch this? 
- -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dwordx4 v -; GCN: s_barrier -; GCN: buffer_store_dwordx4 v define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: merge_global_store_4_adjacent_loads_shuffle_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_barrier +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_4_adjacent_loads_shuffle_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_barrier +; CI-NEXT: v_mov_b32_e32 v4, v2 +; CI-NEXT: v_mov_b32_e32 v5, v1 +; CI-NEXT: v_mov_b32_e32 v6, v0 +; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; CI-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 @@ -402,11 +702,23 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr a ret void } -; GCN-LABEL: 
{{^}}merge_global_store_4_adjacent_loads_i8: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: buffer_store_dword [[LOAD]] -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_adjacent_loads_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1 %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2 %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3 @@ -426,17 +738,32 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace( ret void } -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_adjacent_loads_i8_natural_align: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; 
GCN-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:3 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1 %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2 %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3 @@ -456,11 +783,23 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align( ret void } -; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: merge_global_store_4_vector_elts_loads_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 @@ -478,10 +817,16 @@ define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr 
addr ret void } -; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: -; GCN: ds_write_b16 -; GCN: s_endpgm define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 { +; GCN-LABEL: merge_local_store_2_constants_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7bc8 +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: ds_write_b16 v1, v0 +; GCN-NEXT: s_endpgm %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1 store i8 123, ptr addrspace(3) %out.gep.1 @@ -489,11 +834,28 @@ define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %ou ret void } -; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b -; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 { +; SI-LABEL: merge_local_store_2_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v0, 0x7b +; SI-NEXT: v_mov_b32_e32 v1, 0x1c8 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: ds_write2_b32 v2, v1, v0 offset1:1 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_local_store_2_constants_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 0x1c8 +; CI-NEXT: v_mov_b32_e32 v1, 0x7b +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 +; CI-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1 store i32 123, ptr addrspace(3) %out.gep.1 @@ -501,17 +863,34 @@ define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %o ret void } -; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN-DAG: v_mov_b32_e32 
[[K2:v[0-9]+]], 0x1c8 -; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d -; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3 - -; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2 -; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b -; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 - -; GCN: s_endpgm define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 { +; SI-LABEL: merge_local_store_4_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v1, 0x1c8 +; SI-NEXT: v_mov_b32_e32 v2, 0x14d +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7b +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: ds_write2_b32 v3, v1, v2 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v1, 0x4d2 +; SI-NEXT: ds_write2_b32 v3, v1, v0 offset1:1 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_local_store_4_constants_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 0x1c8 +; CI-NEXT: v_mov_b32_e32 v1, 0x14d +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_mov_b32_e32 v2, 0x4d2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: ds_write2_b32 v3, v0, v1 offset0:2 offset1:3 +; CI-NEXT: v_mov_b32_e32 v0, 0x7b +; CI-NEXT: ds_write2_b32 v3, v2, v0 offset1:1 +; CI-NEXT: s_endpgm %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1 %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2 %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3 @@ -523,13 +902,38 @@ define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %o ret void } -; GCN-LABEL: {{^}}merge_global_store_5_constants_i32: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}} -; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}} -; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} -; GCN: buffer_store_dword v[[HI]] define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr 
addrspace(1) %out) { +; SI-LABEL: merge_global_store_5_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 9 +; SI-NEXT: v_mov_b32_e32 v1, 12 +; SI-NEXT: v_mov_b32_e32 v2, 16 +; SI-NEXT: v_mov_b32_e32 v3, -12 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_5_constants_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 9 +; CI-NEXT: v_mov_b32_e32 v1, 12 +; CI-NEXT: v_mov_b32_e32 v2, 16 +; CI-NEXT: v_mov_b32_e32 v3, -12 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v0, 11 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; CI-NEXT: s_endpgm store i32 9, ptr addrspace(1) %out, align 4 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 store i32 12, ptr addrspace(1) %idx1, align 4 @@ -542,10 +946,40 @@ define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_6_constants_i32: -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) { +; SI-LABEL: merge_global_store_6_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 13 +; SI-NEXT: v_mov_b32_e32 v1, 15 +; SI-NEXT: v_mov_b32_e32 v2, 62 +; SI-NEXT: v_mov_b32_e32 v3, 63 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_mov_b32_e32 v0, 11 +; SI-NEXT: v_mov_b32_e32 v1, 0x7b +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_6_constants_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 13 +; CI-NEXT: v_mov_b32_e32 v1, 15 +; CI-NEXT: v_mov_b32_e32 v2, 62 +; CI-NEXT: v_mov_b32_e32 v3, 63 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v0, 11 +; CI-NEXT: v_mov_b32_e32 v1, 0x7b +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; CI-NEXT: s_endpgm store i32 13, ptr addrspace(1) %out, align 4 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 store i32 15, ptr addrspace(1) %idx1, align 4 @@ -560,11 +994,44 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: -; GCN: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; CI: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) { +; SI-LABEL: merge_global_store_7_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 34 +; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 +; SI-NEXT: v_mov_b32_e32 v2, 0x41 +; SI-NEXT: v_mov_b32_e32 v3, 33 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0xd4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x62 +; SI-NEXT: v_mov_b32_e32 v1, 0x5b +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_7_constants_i32: +; 
CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 34 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e7 +; CI-NEXT: v_mov_b32_e32 v2, 0x41 +; CI-NEXT: v_mov_b32_e32 v3, 33 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v0, 0x62 +; CI-NEXT: v_mov_b32_e32 v1, 0x5b +; CI-NEXT: v_mov_b32_e32 v2, 0xd4 +; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:16 +; CI-NEXT: s_endpgm store i32 34, ptr addrspace(1) %out, align 4 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 store i32 999, ptr addrspace(1) %idx1, align 4 @@ -581,11 +1048,43 @@ define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}merge_global_store_8_constants_i32: -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 -; GCN: s_endpgm define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) { +; SI-LABEL: merge_global_store_8_constants_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 34 +; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 +; SI-NEXT: v_mov_b32_e32 v2, 0x41 +; SI-NEXT: v_mov_b32_e32 v3, 33 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0x62 +; SI-NEXT: v_mov_b32_e32 v3, 0x5b +; SI-NEXT: v_mov_b32_e32 v4, 0xd4 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; SI-NEXT: s_endpgm +; +; CI-LABEL: merge_global_store_8_constants_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, 34 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e7 +; CI-NEXT: v_mov_b32_e32 v2, 0x41 +; CI-NEXT: 
v_mov_b32_e32 v3, 33 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: v_mov_b32_e32 v4, 0xd4 +; CI-NEXT: v_mov_b32_e32 v2, 0x62 +; CI-NEXT: v_mov_b32_e32 v3, 0x5b +; CI-NEXT: v_mov_b32_e32 v5, v1 +; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; CI-NEXT: s_endpgm store i32 34, ptr addrspace(1) %out, align 4 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 store i32 999, ptr addrspace(1) %idx1, align 4 @@ -607,77 +1106,154 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) % ; This requires handling of scalar_to_vector for v2i64 to avoid ; scratch usage. ; FIXME: Should do single load and store - -; GCN-LABEL: {{^}}copy_v3i32_align4: -; GCN-NOT: SCRATCH_RSRC_DWORD -; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-NOT: offen -; GCN: s_waitcnt vmcnt -; GCN-NOT: offen -; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} - -; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; SI-LABEL: copy_v3i32_align4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, 
s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: copy_v3i32_align4: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CI-NEXT: s_endpgm %vec = load <3 x i32>, ptr addrspace(1) %in, align 4 store <3 x i32> %vec, ptr addrspace(1) %out ret void } - -; GCN-LABEL: {{^}}copy_v3i64_align4: -; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -; GCN-NOT: offen -; GCN: s_waitcnt vmcnt -; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GCN: ScratchSize: 0{{$}} + define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; GCN-LABEL: copy_v3i64_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 
s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %vec = load <3 x i64>, ptr addrspace(1) %in, align 4 store <3 x i64> %vec, ptr addrspace(1) %out ret void } - -; GCN-LABEL: {{^}}copy_v3f32_align4: -; GCN-NOT: SCRATCH_RSRC_DWORD -; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-NOT: offen -; GCN: s_waitcnt vmcnt -; GCN-NOT: offen -; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: ScratchSize: 0{{$}} + define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; SI-LABEL: copy_v3f32_align4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: copy_v3f32_align4: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s2 +; CI-NEXT: s_mov_b32 s9, s3 +; CI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CI-NEXT: s_endpgm %vec = load <3 x float>, ptr addrspace(1) %in, align 4 %fadd = fadd <3 x float> %vec, store <3 x float> %fadd, ptr addrspace(1) %out ret void } - -; GCN-LABEL: {{^}}copy_v3f64_align4: -; GCN-NOT: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -; GCN-NOT: offen -; GCN: s_waitcnt vmcnt -; GCN-NOT: offen -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GCN: ScratchSize: 0{{$}} + define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { +; GCN-LABEL: copy_v3f64_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_add_f64 v[4:5], v[4:5], 4.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_endpgm %vec = load <3 x double>, ptr addrspace(1) %in, align 4 %fadd = fadd <3 x double> %vec, store <3 x double> %fadd, ptr addrspace(1) %out ret void } +; GCN: ScratchSize: 0{{$}} declare void @llvm.amdgcn.s.barrier() #1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir index 14ba8fccb172d..1c20db9577695 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir @@ -484,7 +484,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -572,7 +572,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 
2, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -638,7 +638,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -704,7 +704,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -792,7 +792,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = 
V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -814,7 +814,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -836,7 +836,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, 
[[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -902,7 +902,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -924,7 +924,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -946,7 +946,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 
[[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 6, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 6, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -968,7 +968,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -990,7 +990,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 4, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 
0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -1012,7 +1012,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -1034,7 +1034,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -1056,7 +1056,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, 
[[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 1, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec @@ -1078,7 +1078,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec - ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 16, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 %1:vgpr_32 = COPY $vgpr0 %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll deleted file mode 100644 index 42436a1b4c279..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ /dev/null @@ -1,180 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s - - -%pair = type { i32, i32 } - -define void @test_extractvalue_then_else(ptr %ptr, i1 %cond) { -; GFX900-LABEL: test_extractvalue_then_else: -; GFX900: ; %bb.0: ; %if -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: flat_load_dword v3, v[0:1] -; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX900-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GFX900-NEXT: s_cbranch_execz .LBB0_2 -; GFX900-NEXT: ; %bb.1: ; %else -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v3, 1, v3 -; GFX900-NEXT: .LBB0_2: ; %Flow -; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: flat_store_dword v[0:1], v3 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -if: - %load_then = load %pair, ptr %ptr - br i1 %cond, label %then, label %else - -then: - %a_then = extractvalue %pair %load_then, 0 - br label %merge - -else: - %a_else = extractvalue %pair %load_then, 0 - %sum_else = add i32 %a_else, 1 - br label %merge - -merge: - %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ] - store i32 %phi, ptr %ptr - ret void -} - -define void @test_extractvalue_else_then(ptr %ptr, i1 %cond) { -; GFX900-LABEL: test_extractvalue_else_then: -; GFX900: ; %bb.0: ; %if -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: flat_load_dword v3, v[0:1] -; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 -; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX900-NEXT: s_cbranch_execz .LBB1_2 -; GFX900-NEXT: ; %bb.1: ; %else -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v3, 1, v3 -; GFX900-NEXT: .LBB1_2: ; %merge -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: flat_store_dword v[0:1], v3 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] -if: - %load_then = load %pair, ptr %ptr - br i1 %cond, label %else, label %then - -else: - %a_else = extractvalue %pair %load_then, 0 - %sum_else = add i32 %a_else, 1 - br label %merge - -then: - %a_then = extractvalue %pair %load_then, 0 - br label %merge - 
-merge: - %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ] - store i32 %phi, ptr %ptr - ret void -} - -define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { -; GFX900-LABEL: test_loop_with_if: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: s_mov_b64 s[4:5], 0 -; GFX900-NEXT: s_movk_i32 s10, 0xfe -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_bitcmp1_b32 s2, 0 -; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v2, s1 -; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX900-NEXT: v_mov_b32_e32 v1, s0 -; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3 -; GFX900-NEXT: s_branch .LBB2_2 -; GFX900-NEXT: .LBB2_1: ; %latch -; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v5, 20, v3 -; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s10, v5 -; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: flat_store_dword v[1:2], v3 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execz .LBB2_8 -; GFX900-NEXT: .LBB2_2: ; %loop -; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: flat_load_dwordx2 v[3:4], v[1:2] -; GFX900-NEXT: s_and_b64 vcc, exec, s[0:1] -; GFX900-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX900-NEXT: s_mov_b64 s[6:7], 0 -; GFX900-NEXT: s_cbranch_vccnz .LBB2_4 -; GFX900-NEXT: ; %bb.3: ; %if -; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v5 -; GFX900-NEXT: s_andn2_b64 s[8:9], s[2:3], exec -; GFX900-NEXT: s_and_b64 s[12:13], vcc, exec -; GFX900-NEXT: s_mov_b64 s[6:7], -1 -; GFX900-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX900-NEXT: .LBB2_4: ; %Flow -; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[8:9] -; 
GFX900-NEXT: s_xor_b64 s[8:9], exec, s[12:13] -; GFX900-NEXT: s_cbranch_execz .LBB2_6 -; GFX900-NEXT: ; %bb.5: ; %else -; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX900-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; GFX900-NEXT: .LBB2_6: ; %Flow1 -; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GFX900-NEXT: s_cbranch_execz .LBB2_1 -; GFX900-NEXT: ; %bb.7: ; %then -; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX900-NEXT: flat_store_dword v[1:2], v0 -; GFX900-NEXT: s_branch .LBB2_1 -; GFX900-NEXT: .LBB2_8: ; %end -; GFX900-NEXT: s_endpgm -entry: - %a = tail call i32 @llvm.amdgcn.workitem.id.x() - br label %loop - -loop: - %entry_phi = phi i32 [ 0, %entry ], [ %a15, %latch ] - %load = load %pair, ptr %ptr - br i1 %cond, label %if, label %else - -if: - %cmp = icmp sgt i32 %entry_phi, 10 - br i1 %cmp, label %then, label %else - -then: - %a_then = extractvalue %pair %load, 0 - store i32 %a, ptr %ptr, align 4 - br label %latch - -else: - %a2 = extractvalue %pair %load, 1 - %y = extractvalue %pair %load, 0 - %a_else = add i32 %y, %a2 - br label %latch - -latch: - %a_test = phi i32 [ %a_then, %then ], [ %a_else, %else ] - store i32 %a_test, ptr %ptr - %a15 = add nsw i32 %a_test, 20 - %a16 = icmp slt i32 %a15, 255 - br i1 %a16, label %loop, label %end - -end: - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir new file mode 100644 index 0000000000000..586ddf627bd9e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir @@ -0,0 +1,3243 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 
-run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefix=PAIR %s + +--- +name: vopd_combine_low_vgprs +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_low_vgprs + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_low_vgprs + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_mov_max_i32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_max_i32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MAX_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_mov_max_i32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MAX_I32_e32_gfx1250 $vgpr0, $vgpr1, $vgpr1, implicit $exec, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_MAX_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_mov_min_i32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_min_i32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MIN_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_mov_min_i32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MIN_I32_e32_gfx1250 $vgpr0, $vgpr1, $vgpr1, implicit $exec, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_MIN_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... + +--- +name: vopd_no_combine_max_i32_max_i32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_max_i32_max_i32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MAX_I32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MAX_I32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_max_i32_max_i32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_MAX_I32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3 = V_MAX_I32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAX_I32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_MAX_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_min_i32_min_i32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_min_i32_min_i32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MIN_I32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MIN_I32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_min_i32_min_i32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_MIN_I32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3 = V_MIN_I32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MIN_I32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_MIN_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... + +--- +name: vopd_mov_sub_nc_i32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_sub_nc_i32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_U32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_mov_sub_nc_i32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_SUB_U32_e32_gfx1250 $vgpr0, $vgpr1, $vgpr1, implicit $exec, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_SUB_U32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_mov_lshrrev_b32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_lshrrev_b32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_LSHRREV_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_mov_lshrrev_b32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_LSHRREV_B32_e32_gfx1250 $vgpr0, $vgpr1, $vgpr1, implicit $exec, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_LSHRREV_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_mov_ashrrev_i32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_ashrrev_i32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_ASHRREV_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_mov_ashrrev_i32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ASHRREV_I32_e32_gfx1250 $vgpr0, $vgpr1, $vgpr1, implicit $exec, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_ASHRREV_I32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_same_vgprs_banks +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_same_vgprs_banks + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, killed $vgpr5, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_same_vgprs_banks + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, killed $vgpr5, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr3 = V_SUB_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr5, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_same_vgprs +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_same_vgprs + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_same_vgprs + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 $vgpr0, $vgpr1, $vgpr0, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_SUB_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_same_dst_parity +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_same_dst_parity + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_same_dst_parity + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr5 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_e96_gfx1250 0, $vgpr1, 0, $vgpr1, 0, $vgpr0, 0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_x_fmaak +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_x_fmaak + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = V_FMAAK_F32 killed $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_x_fmaak + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1, $vgpr2 = V_DUAL_FMAAK_F32_X_MOV_B32_e32_gfx1250 killed $sgpr0, $vgpr0, 981467136, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, $vgpr0, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr1 = V_FMAAK_F32 $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec +... 
+ +--- +name: vopd_combine_y_fmaak +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_y_fmaak + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_y_fmaak + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx1250 $vgpr0, killed $sgpr0, $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, $vgpr0, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr2 = V_FMAAK_F32 $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_x_fmaak_same_dst_parity +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_x_fmaak_same_dst_parity + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = V_FMAAK_F32 killed $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_x_fmaak_same_dst_parity + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = V_FMAAK_F32 killed $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + ; PAIR-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr1 = V_FMAAK_F32 $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec +... 
+ +--- +name: vopd_no_combine_y_fmaak_same_dst_parity +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_y_fmaak_same_dst_parity + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_y_fmaak_same_dst_parity + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + ; PAIR-NEXT: $vgpr3 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr0, implicit $exec + $vgpr3 = V_FMAAK_F32 $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_literal_x +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_literal_x + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_literal_x + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 12345, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_literal_y +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_literal_y + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_literal_y + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_MUL_F32_e32_X_SUB_F32_e32_gfx1250 $vgpr0, $vgpr0, 12345, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec +... + +# Below 2 tests cannot use VOPD because of the vdst parity and cannot use +# VOPD3 because of the literal use. 
+--- +name: vopd_no_combine_literal_x +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_literal_x + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_literal_x + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_literal_y +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_literal_y + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_SUB_F32_e32 12345, killed $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_literal_y + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e32 killed $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_SUB_F32_e32 12345, killed $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_SUB_F32_e32 12345, $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_add_u32_add_f32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_u32_add_f32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_u32_add_f32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4, $vgpr6 = V_DUAL_ADD_U32_e32_X_ADD_F32_e32_e96_gfx1250 $vgpr0, $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_add_f32_add_u32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_f32_add_u32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_f32_add_u32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr4 = V_DUAL_ADD_F32_e32_X_ADD_U32_e32_e96_gfx1250 0, killed $vgpr2, 0, killed $vgpr3, $vgpr0, $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr6 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr7 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec +... 
+ +--- +name: vopd_combine_add_u32_add_f32_same_dst_parity +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_u32_add_f32_same_dst_parity + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_ADD_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_u32_add_f32_same_dst_parity + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr4 = V_DUAL_ADD_F32_e32_X_ADD_U32_e32_gfx1250 killed $vgpr2, killed $vgpr3, killed $vgpr0, killed $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_add_f32_add_u32_same_dst_parity +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_f32_add_u32_same_dst_parity + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_ADD_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_f32_add_u32_same_dst_parity + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr4 = V_DUAL_ADD_F32_e32_X_ADD_U32_e32_gfx1250 killed $vgpr2, killed $vgpr3, killed $vgpr0, killed $vgpr1, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr5 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec +... 
+ +--- +name: vopd_combine_lshl_lshl +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_lshl_lshl + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_LSHLREV_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_LSHLREV_B32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_lshl_lshl + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4, $vgpr6 = V_DUAL_LSHLREV_B32_e32_X_LSHLREV_B32_e32_e96_gfx1250 killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, implicit $exec, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = V_LSHLREV_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_LSHLREV_B32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_ashr_ashr +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_ashr_ashr + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_ASHRREV_I32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_ASHRREV_I32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_ashr_ashr + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4, $vgpr5 = V_DUAL_ASHRREV_I32_e32_X_ASHRREV_I32_e32_e96_gfx1250 killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, implicit $exec, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = V_ASHRREV_I32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_ASHRREV_I32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_sub_u32_sub_u32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_sub_u32_sub_u32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_SUB_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_SUB_U32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_sub_u32_sub_u32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4, $vgpr5 = V_DUAL_SUB_U32_e32_X_SUB_U32_e32_e96_gfx1250 killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, implicit $exec, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = V_SUB_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_SUB_U32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_sub_u32_sub_u32_lit +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_sub_u32_sub_u32_lit + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_SUB_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_SUB_U32_e32 300, killed $vgpr2, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_sub_u32_sub_u32_lit + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = V_SUB_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_SUB_U32_e32 300, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr4 = V_SUB_U32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_SUB_U32_e32 300, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: vopd_combine_fmac_fmac +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fmac_fmac + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr3, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fmac_fmac + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMAC_F32_e32_gfx1250 $vgpr1, $vgpr1, killed $vgpr2, killed $vgpr1, $vgpr1, killed $vgpr3, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr2 
= V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr3 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr3, implicit $mode, implicit $exec +... + +--- +name: vopd_combine_fmac_fmac_same_dst_parity +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fmac_fmac_same_dst_parity + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr4, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fmac_fmac_same_dst_parity + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr4 = V_DUAL_FMAC_F32_e32_X_FMAC_F32_e32_e96_gfx1250 0, $vgpr1, 0, $vgpr1, killed $vgpr2, 0, killed $vgpr1, 0, $vgpr1, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr4 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fmac_fmac_same_dst +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fmac_fmac_same_dst + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fmac_fmac_same_dst + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_add_f32_fadd_f32_same_dst +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_add_f32_fadd_f32_same_dst + ; SCHED: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_add_f32_fadd_f32_same_dst + ; PAIR: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec +... + +--- +name: vopd_combine_add_f64_add_f32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_f64_add_f32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr8_vgpr9, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_f64_add_f32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5, $vgpr6 = V_DUAL_ADD_F64_pseudo_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr8_vgpr9, 0, killed $vgpr2, 0, killed $vgpr3, implicit $mode, implicit $exec, 
implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr8_vgpr9 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr8_vgpr9, implicit $mode, implicit $exec + $vgpr7 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... + +--- +name: vopd_combine_add_f32_add_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_f32_add_f64 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr8_vgpr9, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_f32_add_f64 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5, $vgpr6 = V_DUAL_ADD_F64_pseudo_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr8_vgpr9, 0, killed $vgpr2, 0, killed $vgpr3, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr8_vgpr9 = IMPLICIT_DEF + $vgpr6 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr7 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr4_vgpr5 = 
V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr8_vgpr9, implicit $mode, implicit $exec +... + +--- +name: vopd_no_combine_add_f64_add_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_add_f64_add_f64 + ; SCHED: $vgpr8_vgpr9 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr8_vgpr9, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr6_vgpr7 = V_ADD_F64_pseudo_e32 killed $vgpr0_vgpr1, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_add_f64_add_f64 + ; PAIR: $vgpr8_vgpr9 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr8_vgpr9, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr6_vgpr7 = V_ADD_F64_pseudo_e32 killed $vgpr0_vgpr1, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr8_vgpr9 = IMPLICIT_DEF + $vgpr10_vgpr11 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr8_vgpr9, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr6_vgpr7 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr10_vgpr11, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_add_f64_add_f32_overlapping_dst +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_add_f64_add_f32_overlapping_dst + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr8_vgpr9, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_add_f64_add_f32_overlapping_dst + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr8_vgpr9, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr3, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr8_vgpr9 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr8_vgpr9, implicit $mode, implicit $exec + $vgpr7 = V_BFM_B32_e32 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_add_f64_add_f32_overlapping_src_sub1 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_add_f64_add_f32_overlapping_src_sub1 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 killed $vgpr0_vgpr1, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr5, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_add_f64_add_f32_overlapping_src_sub1 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 killed $vgpr0_vgpr1, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr5, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr10_vgpr11 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr10_vgpr11, implicit $mode, implicit $exec + $vgpr6 = V_ADD_F32_e32 $vgpr2, $vgpr5, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_add_f64_add_f32_overlapping_src_sub0 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_add_f64_add_f32_overlapping_src_sub0 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 killed $vgpr0_vgpr1, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr4, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_add_f64_add_f32_overlapping_src_sub0 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 killed $vgpr0_vgpr1, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_ADD_F32_e32 killed $vgpr2, killed $vgpr4, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr10_vgpr11 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr10_vgpr11, implicit $mode, implicit $exec + $vgpr6 = V_ADD_F32_e32 $vgpr2, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_fma +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_fma + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_fma + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_fma_bank_conflict_src2 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_fma_bank_conflict_src2 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr10, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_fma_bank_conflict_src2 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr10, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr10 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_add_f32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_add_f32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_add_f32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_add_f32_fma +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_f32_fma + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_f32_fma + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr7, $vgpr6 = V_DUAL_ADD_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, killed $vgpr3, 0, killed $vgpr4, 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_add_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_add_f64 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 killed $vgpr2_vgpr3, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_add_f64 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr6 = V_DUAL_ADD_F64_pseudo_e32_X_FMA_F32_e64_e96_gfx1250 0, killed $vgpr2_vgpr3, 0, killed $vgpr10_vgpr11, 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr10_vgpr11 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 $vgpr2_vgpr3, $vgpr10_vgpr11, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_src0_mod_fma +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_src0_mod_fma + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 3, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_src0_mod_fma + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 3, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 3, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_fma_src1_mod +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_fma_src1_mod + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 2, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_fma_src1_mod + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 2, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 2, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_fma_src2_mod +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_fma_src2_mod + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 3, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_fma_src2_mod + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 3, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 3, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_clamp_fma +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_clamp_fma + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 1, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_clamp_fma + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 1, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_fma_omod +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_fma_omod + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_fma_omod + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 1, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_fma_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_fma_neg + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 1, $vgpr0, 1, $vgpr1, 1, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 1, killed $vgpr3, 1, killed $vgpr4, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_fma_neg + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 1, $vgpr0, 1, $vgpr1, 1, killed $vgpr2, 1, killed $vgpr3, 1, killed $vgpr4, 1, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 1, $vgpr0, 1, $vgpr1, 1, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 1, $vgpr3, 1, $vgpr4, 1, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_fma_src0_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_fma_src0_neg + ; SCHED: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 1, $sgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $sgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_fma_src0_neg + ; PAIR: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 1, $sgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $sgpr0, killed $vgpr1, implicit $exec + $sgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 1, $sgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $sgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_fma_src1_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_fma_src1_neg + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 1, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_fma_src1_neg + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, 1, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 1, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_fma_src2_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_fma_src2_neg + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 1, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_fma_src2_neg + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0, 0, $vgpr1, 1, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 1, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_f64_fma_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_f64_fma_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = V_FMA_F64_e64 1, $vgpr0_vgpr1, 1, killed $vgpr2_vgpr3, 1, killed $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr8, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr12 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_f64_fma_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr7 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11, $vgpr9 = V_DUAL_FMA_F64_e64_X_FMA_F32_e64_e96_gfx1250 1, $vgpr0_vgpr1, 1, killed $vgpr2_vgpr3, 1, killed $vgpr4_vgpr5, 0, killed $vgpr6, 0, killed $vgpr8, 0, killed $vgpr7, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr12 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = IMPLICIT_DEF + $vgpr8 = IMPLICIT_DEF + $vgpr10_vgpr11 = V_FMA_F64_e64 1, $vgpr0_vgpr1, 1, $vgpr2_vgpr3, 1, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr12 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr9 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr8, 0, $vgpr7, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_lshl_add_u64_fma +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_lshl_add_u64_fma + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr2, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_lshl_add_u64_fma + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr2, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec + $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr2, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_lshl_add_u64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_lshl_add_u64 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr2, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 killed $vgpr0_vgpr1, $vgpr1, killed $vgpr2_vgpr3, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_lshl_add_u64 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr2, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 killed $vgpr0_vgpr1, $vgpr1, killed $vgpr2_vgpr3, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr8 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr2, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec +... 
+ +--- +name: vopd_no_combine_lshl_add_u64_fma_overlapping_src2 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_lshl_add_u64_fma_overlapping_src2 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_lshl_add_u64_fma_overlapping_src2 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr1, $vgpr2_vgpr3, implicit $exec + $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_lshl_add_u64_fma_src0_conflict +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_lshl_add_u64_fma_src0_conflict + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, killed $vgpr5, $vgpr2_vgpr3, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr1, 0, killed $vgpr3, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_lshl_add_u64_fma_src0_conflict + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, killed $vgpr5, $vgpr2_vgpr3, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr1, 0, killed $vgpr3, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr5, $vgpr2_vgpr3, implicit $exec + $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8 = V_FMA_F32_e64 0, $vgpr1, 0, $vgpr3, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_lshl_add_u64_fma_src1_conflict +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_lshl_add_u64_fma_src1_conflict + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr5, $vgpr2_vgpr3, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr5, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_lshl_add_u64_fma_src1_conflict + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr5, $vgpr2_vgpr3, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr5, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6_vgpr7 = V_LSHL_ADD_U64_e64 $vgpr0_vgpr1, $vgpr5, $vgpr2_vgpr3, implicit $exec + $vgpr9 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr5, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_f64_fma_f32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_f64_fma_f32 + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr8, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr12 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_f64_fma_f32 + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr7 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11, $vgpr9 = V_DUAL_FMA_F64_e64_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr4_vgpr5, 0, killed $vgpr6, 0, killed $vgpr8, 0, killed $vgpr7, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr12 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = IMPLICIT_DEF + $vgpr8 = IMPLICIT_DEF + $vgpr10_vgpr11 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr12 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr9 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr8, 0, $vgpr7, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fma_f64_fma_f32_overlapping_src1 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fma_f64_fma_f32_overlapping_src1 + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4_vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, killed $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr12 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr3, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fma_f64_fma_f32_overlapping_src1 + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4_vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr7 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, killed $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr12 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr3, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = IMPLICIT_DEF + $vgpr10_vgpr11 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr12 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr9 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr3, 0, $vgpr7, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_f32_add_f64_e32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_f32_add_f64_e32 + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr2_vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_f32_add_f64_e32 + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_ADD_F64_pseudo_e32_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr6, 0, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr10 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_f32_add_f64_e64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_f32_add_f64_e64 + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8_vgpr9 = V_ADD_F64_pseudo_e64 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_f32_add_f64_e64 + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_ADD_F64_pseudo_e32_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr6, 0, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr10 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8_vgpr9 = V_ADD_F64_pseudo_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_f32_add_f64_e64_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_f32_add_f64_e64_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr6, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8_vgpr9 = V_ADD_F64_pseudo_e64 1, $vgpr0_vgpr1, 1, killed $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_f32_add_f64_e64_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_ADD_F64_pseudo_e32_X_FMA_F32_e64_e96_gfx1250 1, $vgpr0_vgpr1, 1, killed $vgpr2_vgpr3, 0, killed $vgpr6, 0, killed $vgpr4, 0, killed $vgpr5, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr7 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr10 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr8_vgpr9 = V_ADD_F64_pseudo_e64 1, $vgpr0_vgpr1, 1, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fma_bitop +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_bitop + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_BITOP3_B32_e64 killed $vgpr3, killed $vgpr4, 0, 123, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_bitop + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr6 = V_DUAL_FMA_F32_e64_X_BITOP2_B32_e64_e96_gfx1250 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, killed $vgpr3, killed $vgpr4, 123, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr7 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_BITOP3_B32_e64 $vgpr3, $vgpr4, 0, 123, implicit $exec +... + +# Make sure bitop3 modifier does not count against constant bus limit. 
+--- +name: vopd_combine_fma_bitop_2_scalar_src +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fma_bitop_2_scalar_src + ; SCHED: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMA_F32_e64 0, $sgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_BITOP3_B32_e64 killed $sgpr3, killed $vgpr4, 0, 123, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_BFM_B32_e64 killed $sgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fma_bitop_2_scalar_src + ; PAIR: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr6 = V_DUAL_FMA_F32_e64_X_BITOP2_B32_e64_e96_gfx1250 0, $sgpr0, 0, $vgpr1, 0, killed $vgpr2, killed $sgpr3, killed $vgpr4, 123, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_BFM_B32_e64 killed $sgpr0, killed $vgpr1, implicit $exec + $sgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = V_FMA_F32_e64 0, $sgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr7 = V_BFM_B32_e64 $sgpr0, $vgpr1, implicit $exec + $vgpr6 = V_BITOP3_B32_e64 $sgpr3, $vgpr4, 0, 123, implicit $exec +... 
+ +--- +name: vopd_combine_bitop_mov_b32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_bitop_mov_b32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_BITOP3_B32_e64 $vgpr0, $vgpr1, 0, 20, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_bitop_mov_b32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr3 = V_DUAL_MOV_B32_e32_X_BITOP2_B32_e64_e96_gfx1250 killed $vgpr2, $vgpr0, $vgpr1, 20, implicit $exec, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = V_BITOP3_B32_e64 $vgpr0, $vgpr1, 0, 20, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_MOV_B32_e32 $vgpr2, implicit $exec +... 
+ +--- +name: vopd_no_combine_mov_b32_bitop_non_imm_src2 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_mov_b32_bitop_non_imm_src2 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_BITOP3_B32_e64 killed $vgpr0, killed $vgpr1, killed $vgpr2, 20, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_mov_b32_bitop_non_imm_src2 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr3 = V_BITOP3_B32_e64 killed $vgpr0, killed $vgpr1, killed $vgpr2, 20, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr5 = V_MOV_B32_e32 $vgpr2, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_BITOP3_B32_e64 $vgpr0, $vgpr1, $vgpr2, 20, implicit $exec +... 
+ +--- +name: vopd_no_combine_mov_b32_bitop_non_zero_src2 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_mov_b32_bitop_non_zero_src2 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_BITOP3_B32_e64 killed $vgpr0, killed $vgpr1, 1, 20, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_mov_b32_bitop_non_zero_src2 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr3 = V_BITOP3_B32_e64 killed $vgpr0, killed $vgpr1, 1, 20, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr5 = V_MOV_B32_e32 $vgpr2, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr3 = V_BITOP3_B32_e64 $vgpr0, $vgpr1, 1, 20, implicit $exec +... 
+ +--- +name: vopd_no_combine_bitop3_mov_dpp_vgpr_src2 +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_no_combine_bitop3_mov_dpp_vgpr_src2 + ; SCHED: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: renamable $vgpr1 = V_MOV_B32_dpp killed $vgpr1, $vgpr3, 258, 15, 15, 0, implicit $exec + ; SCHED-NEXT: renamable $vgpr1 = V_BITOP3_B32_e64 killed $vgpr3, killed $vgpr4, killed $vgpr1, 128, implicit $exec + ; SCHED-NEXT: renamable $vgpr3 = V_MOV_B32_e32 -1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_bitop3_mov_dpp_vgpr_src2 + ; PAIR: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: renamable $vgpr1 = V_MOV_B32_dpp killed $vgpr1, $vgpr3, 258, 15, 15, 0, implicit $exec + ; PAIR-NEXT: renamable $vgpr1 = V_BITOP3_B32_e64 killed $vgpr3, killed $vgpr4, killed $vgpr1, 128, implicit $exec + ; PAIR-NEXT: renamable $vgpr3 = V_MOV_B32_e32 -1, implicit $exec + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + + renamable $vgpr1 = V_MOV_B32_dpp killed $vgpr1, $vgpr3, 258, 15, 15, 0, implicit $exec + renamable $vgpr1 = V_BITOP3_B32_e64 killed $vgpr3, $vgpr4, killed $vgpr1, 128, implicit $exec + renamable $vgpr3 = V_MOV_B32_e32 -1, implicit $exec +... 
+ +--- +name: vopd_combine_mov_or +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mov_or + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_OR_B32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mov_or + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr5 = V_DUAL_MOV_B32_e32_X_BITOP2_B32_e64_e96_gfx1250 $vgpr0, $vgpr1, killed $vgpr2, 84, implicit $exec, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_OR_B32_e32 $vgpr1, $vgpr2, implicit $exec +... 
+ +--- +name: vopd_combine_mov_and +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mov_and + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_AND_B32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mov_and + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr5 = V_DUAL_MOV_B32_e32_X_BITOP2_B32_e64_e96_gfx1250 $vgpr0, $vgpr1, killed $vgpr2, 64, implicit $exec, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_AND_B32_e32 $vgpr1, $vgpr2, implicit $exec +... 
+ +--- +name: vopd_combine_mov_xor +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mov_xor + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mov_xor + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr5 = V_DUAL_MOV_B32_e32_X_BITOP2_B32_e64_e96_gfx1250 $vgpr0, $vgpr1, killed $vgpr2, 20, implicit $exec, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr2, implicit $exec +... 
+ +--- +name: vopd_combine_mov_xnor +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mov_xnor + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_XNOR_B32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mov_xnor + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr5 = V_DUAL_MOV_B32_e32_X_BITOP2_B32_e64_e96_gfx1250 $vgpr0, $vgpr1, killed $vgpr2, 65, implicit $exec, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_XNOR_B32_e32 $vgpr1, $vgpr2, implicit $exec +... + +# V_NOT_B32 can also be combined to BITOP2, but we need to come up with a fake src1 +# which would satisfy all register constraints and does not break liveness. +# This is not trivial at the very least. 
+--- +name: vopd_combine_mov_not +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mov_not + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_NOT_B32_e32 killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mov_not + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_NOT_B32_e32 killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = V_NOT_B32_e32 $vgpr1, implicit $exec +... + +--- +name: vopd_combine_fadd_not +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fadd_not + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_NOT_B32_e32 killed $vgpr2, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fadd_not + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_NOT_B32_e32 killed $vgpr2, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr5 = 
V_NOT_B32_e32 $vgpr2, implicit $exec +... + +--- +name: vopd_combine_fadd_f64_not +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fadd_f64_not + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr2_vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr11 = V_NOT_B32_e32 killed $vgpr6, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fadd_f64_not + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr2_vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr10 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr11 = V_NOT_B32_e32 killed $vgpr6, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_ADD_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit $mode, implicit $exec + $vgpr10 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr11 = V_NOT_B32_e32 $vgpr6, implicit $exec +... 
+ +--- +name: vopd_no_combine_src1_imm +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_src1_imm + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, 1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_src1_imm + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, 1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, 1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_src2_imm +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_src2_imm + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, 1, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_src2_imm + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, 1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, 1, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_src1_sgpr +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_src1_sgpr + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, killed $sgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_src1_sgpr + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, killed $sgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $sgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_src2_sgpr +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_src2_sgpr + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $sgpr1, 0, 0, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_src2_sgpr + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $sgpr1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $sgpr1, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_fadd +tracksRegLiveness: true +body: | + bb.0: + liveins: $vcc_lo + + ; SCHED-LABEL: name: vopd_combine_cndmask_fadd + ; SCHED: liveins: $vcc_lo + ; SCHED-NEXT: {{ $}} + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $exec, implicit killed $vcc_lo + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_fadd + ; PAIR: liveins: $vcc_lo + ; PAIR-NEXT: {{ $}} + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx1250 $vgpr0, $vgpr1, killed $vgpr3, killed $vgpr4, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $exec, implicit killed $vcc_lo, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $exec, implicit $vcc + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_fma +tracksRegLiveness: true +body: | + bb.0: + liveins: $vcc_lo + + ; SCHED-LABEL: name: vopd_combine_cndmask_fma + ; SCHED: liveins: $vcc_lo + ; SCHED-NEXT: {{ $}} + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $exec, implicit killed $vcc_lo + ; SCHED-NEXT: $vgpr7 = V_FMA_F32_e64 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_fma + ; PAIR: liveins: $vcc_lo + ; PAIR-NEXT: {{ $}} + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_FMA_F32_e64_e96_gfx1250 0, $vgpr0, 0, $vgpr1, $vcc_lo, 0, killed $vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, implicit $exec, implicit $mode, implicit $exec, implicit killed $vcc_lo, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr5 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $exec, implicit $vcc + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_FMA_F32_e64 0, $vgpr3, 0, $vgpr4, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e64_vcc_fadd +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_cndmask_e64_vcc_fadd + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vcc = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e64_vcc_fadd + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vcc = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, killed $vcc_lo, 0, killed $vgpr3, 0, killed $vgpr4, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vcc = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, $vcc_lo, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e64_sgpr_fadd +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_cndmask_e64_sgpr_fadd + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, killed $sgpr0, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e64_sgpr_fadd + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, killed $sgpr0, 0, killed $vgpr3, 0, killed $vgpr4, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, $sgpr0, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e64_neg_vcc_fadd +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_cndmask_e64_neg_vcc_fadd + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vcc = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 1, $vgpr1, killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e64_neg_vcc_fadd + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vcc = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0, 1, $vgpr1, killed $vcc_lo, 0, killed $vgpr3, 0, killed $vgpr4, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vcc = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 1, $vgpr1, $vcc_lo, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_cndmask_e64_vcc_fadd_constant_bus_limit +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_cndmask_e64_vcc_fadd_constant_bus_limit + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vcc = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, killed $sgpr0, 0, $vgpr1, killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $sgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_cndmask_e64_vcc_fadd_constant_bus_limit + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vcc = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, killed $sgpr0, 0, $vgpr1, killed $vcc_lo, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_ADD_F32_e32 killed $sgpr3, killed $vgpr4, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $sgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vcc = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $sgpr0, 0, $vgpr1, $vcc_lo, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $sgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_cndmask_e64_vcc_fadd_sgpr_src1 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_cndmask_e64_vcc_fadd_sgpr_src1 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vcc = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, killed $sgpr0, killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $sgpr3, killed $vgpr4, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_cndmask_e64_vcc_fadd_sgpr_src1 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vcc = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, killed $sgpr0, killed $vcc_lo, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_ADD_F32_e32 killed $sgpr3, killed $vgpr4, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $sgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vcc = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $sgpr0, $vcc_lo, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $sgpr3, $vgpr4, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e64_cndmask_e32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vcc_lo + + ; SCHED-LABEL: name: vopd_combine_cndmask_e64_cndmask_e32 + ; SCHED: liveins: $vcc_lo + ; SCHED-NEXT: {{ $}} + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, killed $sgpr0, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr3, killed $vgpr4, implicit killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e64_cndmask_e32 + ; PAIR: liveins: $vcc_lo + ; PAIR-NEXT: {{ $}} + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, killed $sgpr0, 0, killed $vgpr3, 0, killed $vgpr4, $vcc_lo, implicit $exec, implicit $exec, implicit killed $vcc_lo, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, $sgpr0, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_CNDMASK_B32_e32 $vgpr3, $vgpr4, implicit $vcc, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e32_cndmask_e64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vcc_lo + + ; SCHED-LABEL: name: vopd_combine_cndmask_e32_cndmask_e64 + ; SCHED: liveins: $vcc_lo + ; SCHED-NEXT: {{ $}} + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e64 0, killed $vgpr3, 0, killed $vgpr4, killed $sgpr0, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e32_cndmask_e64 + ; PAIR: liveins: $vcc_lo + ; PAIR-NEXT: {{ $}} + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, $vcc_lo, 0, killed $vgpr3, 0, killed $vgpr4, killed $sgpr0, implicit $exec, implicit killed $vcc_lo, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $vcc, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e32_cndmask_e32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vcc_lo + + ; SCHED-LABEL: name: vopd_combine_cndmask_e32_cndmask_e32 + ; SCHED: liveins: $vcc_lo + ; SCHED-NEXT: {{ $}} + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr3, killed $vgpr4, implicit killed $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e32_cndmask_e32 + ; PAIR: liveins: $vcc_lo + ; PAIR-NEXT: {{ $}} + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_gfx1250 $vgpr0, $vgpr1, killed $vgpr3, killed $vgpr4, implicit $vcc_lo, implicit $exec, implicit $vcc_lo, implicit $exec, implicit killed $vcc_lo, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e32 $vgpr0, $vgpr1, implicit $vcc, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_CNDMASK_B32_e32 $vgpr3, $vgpr4, implicit $vcc, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e64_cndmask_e64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_cndmask_e64_cndmask_e64 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, killed $sgpr0, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e64 0, killed $vgpr3, 0, killed $vgpr4, killed $sgpr1, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e64_cndmask_e64 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, killed $sgpr0, 0, killed $vgpr3, 0, killed $vgpr4, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr1, $sgpr0, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr1, implicit $exec +... 
+ +--- +name: vopd_combine_fadd_e64_fadd_e64 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fadd_e64_fadd_e64 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e64 0, killed $vgpr3, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fadd_e64_fadd_e64 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_ADD_F32_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr3, 0, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e64 0, $vgpr3, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_fadd_e64_neg_fadd_e32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_fadd_e64_neg_fadd_e32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 1, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_fadd_e64_neg_fadd_e32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_ADD_F32_e32_X_ADD_F32_e32_e96_gfx1250 0, $vgpr0, 1, $vgpr1, 0, killed $vgpr3, 0, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 1, $vgpr1, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr2, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_no_combine_fadd_e64_abs_neg_fadd_e32 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_no_combine_fadd_e64_abs_neg_fadd_e32 + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 3, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr2, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_fadd_e64_abs_neg_fadd_e32 + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 3, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr8 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; PAIR-NEXT: $vgpr7 = V_ADD_F32_e32 killed $vgpr3, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr6 = V_ADD_F32_e64 0, $vgpr0, 3, $vgpr1, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr2, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_mul_f64_e64_sub_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mul_f64_e64_sub_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_MUL_F64_pseudo_e64 1, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_SUB_F32_e64 0, killed $vgpr6, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mul_f64_e64_sub_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_MUL_F64_pseudo_e32_X_SUB_F32_e32_e96_gfx1250 1, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr6, 1, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_MUL_F64_pseudo_e64 1, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_SUB_F32_e64 0, $vgpr6, 1, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_mul_f64_e32_subrev_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_mul_f64_e32_subrev_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_MUL_F64_pseudo_e32 $vgpr0_vgpr1, killed $vgpr2_vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_SUBREV_F32_e64 1, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_mul_f64_e32_subrev_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_MUL_F64_pseudo_e32_X_SUBREV_F32_e32_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 1, killed $vgpr6, 0, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_MUL_F64_pseudo_e32 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_SUBREV_F32_e64 1, $vgpr6, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_min_num_f64_e64_mul_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_min_num_f64_e64_mul_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_MIN_NUM_F64_e64 1, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_MUL_F32_e64 0, killed $vgpr6, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_min_num_f64_e64_mul_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_MIN_NUM_F64_e32_X_MUL_F32_e32_e96_gfx1250 1, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr6, 1, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_MIN_NUM_F64_e64 1, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_MUL_F32_e64 0, $vgpr6, 1, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_min_num_f64_e32_mul_legacy_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_min_num_f64_e32_mul_legacy_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_MIN_NUM_F64_e32 $vgpr0_vgpr1, killed $vgpr2_vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_MUL_LEGACY_F32_e64 1, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_min_num_f64_e32_mul_legacy_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_MIN_NUM_F64_e32_X_MUL_LEGACY_F32_e32_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 1, killed $vgpr6, 0, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_MIN_NUM_F64_e32 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_MUL_LEGACY_F32_e64 1, $vgpr6, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_max_num_f64_e64_min_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_max_num_f64_e64_min_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_MAX_NUM_F64_e64 0, $vgpr0_vgpr1, 1, killed $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_MIN_F32_e64 1, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_max_num_f64_e64_min_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_MAX_NUM_F64_e32_X_MIN_F32_e32_e96_gfx1250 0, $vgpr0_vgpr1, 1, killed $vgpr2_vgpr3, 1, killed $vgpr6, 0, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_MAX_NUM_F64_e64 0, $vgpr0_vgpr1, 1, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_MIN_F32_e64 1, $vgpr6, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_max_num_f64_e32_max_f32_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_max_num_f64_e32_max_f32_neg + ; SCHED: $vgpr0_vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr8_vgpr9 = V_MAX_NUM_F64_e32 $vgpr0_vgpr1, killed $vgpr2_vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_MAX_F32_e64 0, killed $vgpr6, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_max_num_f64_e32_max_f32_neg + ; PAIR: $vgpr0_vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr8_vgpr9, $vgpr7 = V_DUAL_MAX_NUM_F64_e32_X_MAX_F32_e32_e96_gfx1250 0, $vgpr0_vgpr1, 0, killed $vgpr2_vgpr3, 0, killed $vgpr6, 1, killed $vgpr4, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr8_vgpr9 = V_MAX_NUM_F64_e32 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit $mode, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr7 = V_MAX_F32_e64 0, $vgpr6, 1, $vgpr4, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_add_f64_fmac_f32_e64_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_add_f64_fmac_f32_e64_neg + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr10_vgpr11, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_FMAC_F32_e64 0, $vgpr0, 1, $vgpr1, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 10, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_add_f64_fmac_f32_e64_neg + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr10_vgpr11 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2_vgpr3, $vgpr6 = V_DUAL_ADD_F64_pseudo_e32_X_FMAC_F32_e32_e96_gfx1250 0, 10, 0, killed $vgpr10_vgpr11, 0, $vgpr0, 1, $vgpr1, killed $vgpr6, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 10, implicit $exec + ; PAIR-NEXT: $vgpr5 = V_BFM_B32_e64 killed $vgpr0, killed $vgpr1, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr6 = IMPLICIT_DEF + $vgpr10_vgpr11 = IMPLICIT_DEF + $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, $vgpr10_vgpr11, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MOV_B64_e32 10, implicit $exec + $vgpr5 = V_BFM_B32_e64 $vgpr0, $vgpr1, implicit $exec + $vgpr6 = V_FMAC_F32_e64 0, $vgpr0, 1, $vgpr1, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec +... 
+ +--- +name: vopd_combine_cndmask_e64_neg_cndmask_e64_neg +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_combine_cndmask_e64_neg_cndmask_e64_neg + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vcc = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr6 = V_CNDMASK_B32_e64 1, killed $vgpr0, 0, killed $vgpr1, $vcc_lo, implicit $exec + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e64 1, killed $vgpr3, 0, killed $vgpr4, killed $vcc_lo, implicit $exec + ; + ; PAIR-LABEL: name: vopd_combine_cndmask_e64_neg_cndmask_e64_neg + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vcc = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr6, $vgpr7 = V_DUAL_CNDMASK_B32_e32_X_CNDMASK_B32_e32_e96_gfx1250 1, killed $vgpr0, 0, killed $vgpr1, $vcc_lo, 1, killed $vgpr3, 0, killed $vgpr4, killed $vcc_lo, implicit $exec, implicit $exec, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr4 = IMPLICIT_DEF + $vcc = IMPLICIT_DEF + $vgpr6 = V_CNDMASK_B32_e64 1, $vgpr0, 0, $vgpr1, $vcc_lo, implicit $exec + $vgpr7 = V_CNDMASK_B32_e64 1, $vgpr3, 0, $vgpr4, $vcc_lo, implicit $exec +... 
+ +--- +name: vopd_no_combine_dpp +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; SCHED-LABEL: name: vopd_no_combine_dpp + ; SCHED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; SCHED-NEXT: {{ $}} + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; SCHED-NEXT: $vgpr0 = V_ADD_F32_e64_dpp killed $vgpr0, 0, killed $vgpr2, 0, killed $vgpr1, 0, 1, 1, 15, 15, 1, implicit $mode, implicit $exec + ; + ; PAIR-LABEL: name: vopd_no_combine_dpp + ; PAIR: liveins: $vgpr0, $vgpr1, $vgpr2 + ; PAIR-NEXT: {{ $}} + ; PAIR-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; PAIR-NEXT: $vgpr0 = V_ADD_F32_e64_dpp killed $vgpr0, 0, killed $vgpr2, 0, killed $vgpr1, 0, 1, 1, 15, 15, 1, implicit $mode, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = V_ADD_F32_e64_dpp $vgpr0, 0, $vgpr2, 0, $vgpr1, 0, 1, 1, 15, 15, 1, implicit $mode, implicit $exec +... diff --git a/llvm/test/CodeGen/ARM/min-max-combine.ll b/llvm/test/CodeGen/ARM/min-max-combine.ll new file mode 100644 index 0000000000000..8cb0d79f5e339 --- /dev/null +++ b/llvm/test/CodeGen/ARM/min-max-combine.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=armv7a < %s | FileCheck %s --check-prefix=ARM +; RUN: llc -mtriple=armv6m < %s | FileCheck %s --check-prefix=THUMB +; RUN: llc -mtriple=armv7m < %s | FileCheck %s --check-prefix=THUMB2 +; RUN: llc -mtriple=thumbv8.1m.main < %s | FileCheck %s --check-prefix=THUMBV8 + +declare i8 @llvm.smax.i8(i8 %a, i8 %b) readnone + +define i8 @smaxi8_zero(i8 %a) { +; ARM-LABEL: smaxi8_zero: +; ARM: @ %bb.0: +; ARM-NEXT: sxtb r0, r0 +; ARM-NEXT: bic r0, r0, r0, asr #31 +; ARM-NEXT: bx lr +; +; THUMB-LABEL: smaxi8_zero: +; THUMB: @ %bb.0: +; THUMB-NEXT: sxtb r0, r0 +; THUMB-NEXT: asrs r1, r0, #31 +; THUMB-NEXT: bics r0, r1 +; THUMB-NEXT: bx lr +; +; THUMB2-LABEL: smaxi8_zero: +; THUMB2: @ %bb.0: +; THUMB2-NEXT: sxtb r0, r0 +; THUMB2-NEXT: bic.w r0, r0, r0, asr #31 +; THUMB2-NEXT: bx lr 
+; +; THUMBV8-LABEL: smaxi8_zero: +; THUMBV8: @ %bb.0: +; THUMBV8-NEXT: sxtb r0, r0 +; THUMBV8-NEXT: bic.w r0, r0, r0, asr #31 +; THUMBV8-NEXT: bx lr + %c = call i8 @llvm.smax.i8(i8 %a, i8 0) + ret i8 %c +} + +declare i16 @llvm.smax.i16(i16 %a, i16 %b) readnone + +define i16 @smaxi16_zero(i16 %a) { +; ARM-LABEL: smaxi16_zero: +; ARM: @ %bb.0: +; ARM-NEXT: sxth r0, r0 +; ARM-NEXT: bic r0, r0, r0, asr #31 +; ARM-NEXT: bx lr +; +; THUMB-LABEL: smaxi16_zero: +; THUMB: @ %bb.0: +; THUMB-NEXT: sxth r0, r0 +; THUMB-NEXT: asrs r1, r0, #31 +; THUMB-NEXT: bics r0, r1 +; THUMB-NEXT: bx lr +; +; THUMB2-LABEL: smaxi16_zero: +; THUMB2: @ %bb.0: +; THUMB2-NEXT: sxth r0, r0 +; THUMB2-NEXT: bic.w r0, r0, r0, asr #31 +; THUMB2-NEXT: bx lr +; +; THUMBV8-LABEL: smaxi16_zero: +; THUMBV8: @ %bb.0: +; THUMBV8-NEXT: sxth r0, r0 +; THUMBV8-NEXT: bic.w r0, r0, r0, asr #31 +; THUMBV8-NEXT: bx lr + %c = call i16 @llvm.smax.i16(i16 %a, i16 0) + ret i16 %c +} + +declare i32 @llvm.smax.i32(i32 %a, i32 %b) readnone + +define i32 @smaxi32_zero(i32 %a) { +; ARM-LABEL: smaxi32_zero: +; ARM: @ %bb.0: +; ARM-NEXT: bic r0, r0, r0, asr #31 +; ARM-NEXT: bx lr +; +; THUMB-LABEL: smaxi32_zero: +; THUMB: @ %bb.0: +; THUMB-NEXT: asrs r1, r0, #31 +; THUMB-NEXT: bics r0, r1 +; THUMB-NEXT: bx lr +; +; THUMB2-LABEL: smaxi32_zero: +; THUMB2: @ %bb.0: +; THUMB2-NEXT: bic.w r0, r0, r0, asr #31 +; THUMB2-NEXT: bx lr +; +; THUMBV8-LABEL: smaxi32_zero: +; THUMBV8: @ %bb.0: +; THUMBV8-NEXT: bic.w r0, r0, r0, asr #31 +; THUMBV8-NEXT: bx lr + %c = call i32 @llvm.smax.i32(i32 %a, i32 0) + ret i32 %c +} + +; SMIN + +declare i8 @llvm.smin.i8(i8 %a, i8 %b) readnone + +define i8 @smini8_zero(i8 %a) { +; ARM-LABEL: smini8_zero: +; ARM: @ %bb.0: +; ARM-NEXT: sxtb r0, r0 +; ARM-NEXT: and r0, r0, r0, asr #31 +; ARM-NEXT: bx lr +; +; THUMB-LABEL: smini8_zero: +; THUMB: @ %bb.0: +; THUMB-NEXT: sxtb r1, r0 +; THUMB-NEXT: asrs r0, r1, #31 +; THUMB-NEXT: ands r0, r1 +; THUMB-NEXT: bx lr +; +; THUMB2-LABEL: smini8_zero: +; THUMB2: @ 
%bb.0: +; THUMB2-NEXT: sxtb r0, r0 +; THUMB2-NEXT: and.w r0, r0, r0, asr #31 +; THUMB2-NEXT: bx lr +; +; THUMBV8-LABEL: smini8_zero: +; THUMBV8: @ %bb.0: +; THUMBV8-NEXT: sxtb r0, r0 +; THUMBV8-NEXT: and.w r0, r0, r0, asr #31 +; THUMBV8-NEXT: bx lr + %c = call i8 @llvm.smin.i8(i8 %a, i8 0) + ret i8 %c +} + +declare i16 @llvm.smin.i16(i16 %a, i16 %b) readnone + +define i16 @smini16_zero(i16 %a) { +; ARM-LABEL: smini16_zero: +; ARM: @ %bb.0: +; ARM-NEXT: sxth r0, r0 +; ARM-NEXT: and r0, r0, r0, asr #31 +; ARM-NEXT: bx lr +; +; THUMB-LABEL: smini16_zero: +; THUMB: @ %bb.0: +; THUMB-NEXT: sxth r1, r0 +; THUMB-NEXT: asrs r0, r1, #31 +; THUMB-NEXT: ands r0, r1 +; THUMB-NEXT: bx lr +; +; THUMB2-LABEL: smini16_zero: +; THUMB2: @ %bb.0: +; THUMB2-NEXT: sxth r0, r0 +; THUMB2-NEXT: and.w r0, r0, r0, asr #31 +; THUMB2-NEXT: bx lr +; +; THUMBV8-LABEL: smini16_zero: +; THUMBV8: @ %bb.0: +; THUMBV8-NEXT: sxth r0, r0 +; THUMBV8-NEXT: and.w r0, r0, r0, asr #31 +; THUMBV8-NEXT: bx lr + %c = call i16 @llvm.smin.i16(i16 %a, i16 0) + ret i16 %c +} + +declare i32 @llvm.smin.i32(i32 %a, i32 %b) readnone + +define i32 @smini32_zero(i32 %a) { +; ARM-LABEL: smini32_zero: +; ARM: @ %bb.0: +; ARM-NEXT: and r0, r0, r0, asr #31 +; ARM-NEXT: bx lr +; +; THUMB-LABEL: smini32_zero: +; THUMB: @ %bb.0: +; THUMB-NEXT: asrs r1, r0, #31 +; THUMB-NEXT: ands r0, r1 +; THUMB-NEXT: bx lr +; +; THUMB2-LABEL: smini32_zero: +; THUMB2: @ %bb.0: +; THUMB2-NEXT: and.w r0, r0, r0, asr #31 +; THUMB2-NEXT: bx lr +; +; THUMBV8-LABEL: smini32_zero: +; THUMBV8: @ %bb.0: +; THUMBV8-NEXT: and.w r0, r0, r0, asr #31 +; THUMBV8-NEXT: bx lr + %c = call i32 @llvm.smin.i32(i32 %a, i32 0) + ret i32 %c +} diff --git a/llvm/test/CodeGen/BPF/remove_truncate_9.ll b/llvm/test/CodeGen/BPF/remove_truncate_9.ll index dd3114926bcf1..5ea55ef81d650 100644 --- a/llvm/test/CodeGen/BPF/remove_truncate_9.ll +++ b/llvm/test/CodeGen/BPF/remove_truncate_9.ll @@ -1,5 +1,6 @@ -; RUN: llc -mcpu=v2 -mtriple=bpf < %s | FileCheck %s -; RUN: llc 
-mcpu=v4 -mtriple=bpf < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mcpu=v2 -mtriple=bpf < %s | FileCheck %s --check-prefixes=CHECK-V2 +; RUN: llc -mcpu=v4 -mtriple=bpf < %s | FileCheck %s --check-prefixes=CHECK-V4 ; Zero extension instructions should be eliminated at instruction ; selection phase for all test cases below. @@ -9,10 +10,36 @@ ; generated code (<<= remains because %c is used by both call and ; lshr in a few test cases). -; CHECK-NOT: &= -; CHECK-NOT: >>= - define void @shl_lshr_same_bb(ptr %p) { +; CHECK-V2-LABEL: shl_lshr_same_bb: +; CHECK-V2: # %bb.0: # %entry +; CHECK-V2-NEXT: r1 = *(u8 *)(r1 + 0) +; CHECK-V2-NEXT: r5 = 1 +; CHECK-V2-NEXT: if r1 == 0 goto LBB0_2 +; CHECK-V2-NEXT: # %bb.1: # %entry +; CHECK-V2-NEXT: r5 = 0 +; CHECK-V2-NEXT: LBB0_2: # %entry +; CHECK-V2-NEXT: r3 = r1 +; CHECK-V2-NEXT: r3 <<= 56 +; CHECK-V2-NEXT: r2 = r1 +; CHECK-V2-NEXT: r4 = r1 +; CHECK-V2-NEXT: call sink1 +; CHECK-V2-NEXT: exit +; +; CHECK-V4-LABEL: shl_lshr_same_bb: +; CHECK-V4: # %bb.0: # %entry +; CHECK-V4-NEXT: w1 = *(u8 *)(r1 + 0) +; CHECK-V4-NEXT: w5 = 1 +; CHECK-V4-NEXT: if w1 == 0 goto LBB0_2 +; CHECK-V4-NEXT: # %bb.1: # %entry +; CHECK-V4-NEXT: w5 = 0 +; CHECK-V4-NEXT: LBB0_2: # %entry +; CHECK-V4-NEXT: r3 = r1 +; CHECK-V4-NEXT: r3 <<= 56 +; CHECK-V4-NEXT: r2 = r1 +; CHECK-V4-NEXT: r4 = r1 +; CHECK-V4-NEXT: call sink1 +; CHECK-V4-NEXT: exit entry: %a = load i8, ptr %p, align 1 %b = zext i8 %a to i64 @@ -26,6 +53,35 @@ entry: } define void @shl_lshr_diff_bb(ptr %p) { +; CHECK-V2-LABEL: shl_lshr_diff_bb: +; CHECK-V2: # %bb.0: # %entry +; CHECK-V2-NEXT: r1 = *(u16 *)(r1 + 0) +; CHECK-V2-NEXT: r5 = 1 +; CHECK-V2-NEXT: if r1 == 0 goto LBB1_2 +; CHECK-V2-NEXT: # %bb.1: # %entry +; CHECK-V2-NEXT: r5 = 0 +; CHECK-V2-NEXT: LBB1_2: # %entry +; CHECK-V2-NEXT: r3 = r1 +; CHECK-V2-NEXT: r3 <<= 48 +; CHECK-V2-NEXT: r2 = r1 +; CHECK-V2-NEXT: r4 = r1 +; CHECK-V2-NEXT: call sink2 +; 
CHECK-V2-NEXT: exit +; +; CHECK-V4-LABEL: shl_lshr_diff_bb: +; CHECK-V4: # %bb.0: # %entry +; CHECK-V4-NEXT: w1 = *(u16 *)(r1 + 0) +; CHECK-V4-NEXT: w5 = 1 +; CHECK-V4-NEXT: if w1 == 0 goto LBB1_2 +; CHECK-V4-NEXT: # %bb.1: # %entry +; CHECK-V4-NEXT: w5 = 0 +; CHECK-V4-NEXT: LBB1_2: # %entry +; CHECK-V4-NEXT: r3 = r1 +; CHECK-V4-NEXT: r3 <<= 48 +; CHECK-V4-NEXT: r2 = r1 +; CHECK-V4-NEXT: r4 = r1 +; CHECK-V4-NEXT: call sink2 +; CHECK-V4-NEXT: exit entry: %a = load i16, ptr %p, align 2 %b = zext i16 %a to i64 @@ -45,6 +101,27 @@ next: } define void @load_zext_same_bb(ptr %p) { +; CHECK-V2-LABEL: load_zext_same_bb: +; CHECK-V2: # %bb.0: # %entry +; CHECK-V2-NEXT: r1 = *(u8 *)(r1 + 0) +; CHECK-V2-NEXT: r2 = 1 +; CHECK-V2-NEXT: if r1 == 0 goto LBB2_2 +; CHECK-V2-NEXT: # %bb.1: # %entry +; CHECK-V2-NEXT: r2 = 0 +; CHECK-V2-NEXT: LBB2_2: # %entry +; CHECK-V2-NEXT: call sink3 +; CHECK-V2-NEXT: exit +; +; CHECK-V4-LABEL: load_zext_same_bb: +; CHECK-V4: # %bb.0: # %entry +; CHECK-V4-NEXT: w1 = *(u8 *)(r1 + 0) +; CHECK-V4-NEXT: w2 = 1 +; CHECK-V4-NEXT: if w1 == 0 goto LBB2_2 +; CHECK-V4-NEXT: # %bb.1: # %entry +; CHECK-V4-NEXT: w2 = 0 +; CHECK-V4-NEXT: LBB2_2: # %entry +; CHECK-V4-NEXT: call sink3 +; CHECK-V4-NEXT: exit entry: %a = load i8, ptr %p, align 1 ; zext is implicit in this context @@ -54,6 +131,27 @@ entry: } define void @load_zext_diff_bb(ptr %p) { +; CHECK-V2-LABEL: load_zext_diff_bb: +; CHECK-V2: # %bb.0: # %entry +; CHECK-V2-NEXT: r1 = *(u8 *)(r1 + 0) +; CHECK-V2-NEXT: r2 = 1 +; CHECK-V2-NEXT: if r1 == 0 goto LBB3_2 +; CHECK-V2-NEXT: # %bb.1: # %next +; CHECK-V2-NEXT: r2 = 0 +; CHECK-V2-NEXT: LBB3_2: # %next +; CHECK-V2-NEXT: call sink3 +; CHECK-V2-NEXT: exit +; +; CHECK-V4-LABEL: load_zext_diff_bb: +; CHECK-V4: # %bb.0: # %entry +; CHECK-V4-NEXT: w1 = *(u8 *)(r1 + 0) +; CHECK-V4-NEXT: w2 = 1 +; CHECK-V4-NEXT: if w1 == 0 goto LBB3_2 +; CHECK-V4-NEXT: # %bb.1: # %next +; CHECK-V4-NEXT: w2 = 0 +; CHECK-V4-NEXT: LBB3_2: # %next +; CHECK-V4-NEXT: call sink3 +; 
CHECK-V4-NEXT: exit entry: %a = load i8, ptr %p, align 1 br label %next @@ -65,6 +163,27 @@ next: } define void @load_zext_diff_bb_2(ptr %p) { +; CHECK-V2-LABEL: load_zext_diff_bb_2: +; CHECK-V2: # %bb.0: # %entry +; CHECK-V2-NEXT: r1 = *(u32 *)(r1 + 0) +; CHECK-V2-NEXT: r2 = 1 +; CHECK-V2-NEXT: if r1 == 0 goto LBB4_2 +; CHECK-V2-NEXT: # %bb.1: # %next +; CHECK-V2-NEXT: r2 = 0 +; CHECK-V2-NEXT: LBB4_2: # %next +; CHECK-V2-NEXT: call sink4 +; CHECK-V2-NEXT: exit +; +; CHECK-V4-LABEL: load_zext_diff_bb_2: +; CHECK-V4: # %bb.0: # %entry +; CHECK-V4-NEXT: w1 = *(u32 *)(r1 + 0) +; CHECK-V4-NEXT: w2 = 1 +; CHECK-V4-NEXT: if w1 == 0 goto LBB4_2 +; CHECK-V4-NEXT: # %bb.1: # %next +; CHECK-V4-NEXT: w2 = 0 +; CHECK-V4-NEXT: LBB4_2: # %next +; CHECK-V4-NEXT: call sink4 +; CHECK-V4-NEXT: exit entry: %a = load i32, ptr %p, align 4 br label %next diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/lifetimes-noint64op.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/lifetimes-noint64op.ll new file mode 100644 index 0000000000000..736c86ebb1299 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/lifetimes-noint64op.ll @@ -0,0 +1,36 @@ +; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC + +target triple = "dxil-pc-shadermodel6.7-library" + +; CHECK: ; Combined Shader Flags for Module +; CHECK-NEXT: ; Shader Flags Value: 0x00000000 +; CHECK-NEXT: ; +; CHECK-NOT: ; Note: shader requires additional functionality: +; CHECK-NOT: ; 64-Bit integer +; CHECK-NOT: ; Note: extra DXIL module flags: +; CHECK-NOT: ; +; CHECK-NEXT: ; Shader Flags for Module Functions +; CHECK-NEXT: ; Function lifetimes : 0x00000000 + +define void @lifetimes() #0 { + %a = alloca [4 x i32], align 8 + call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %a) + call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %a) + ret void +} + +; Function Attrs: nounwind memory(argmem: readwrite) +declare void 
@llvm.lifetime.start.p0(i64, ptr) #1 + +; Function Attrs: nounwind memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64, ptr) #1 + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} +attributes #1 = { nounwind memory(argmem: readwrite) } + +; DXC: - Name: SFI0 +; DXC-NEXT: Size: 8 +; DXC-NOT: Flags: +; DXC-NOT: Int64Ops: true +; DXC: ... diff --git a/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll b/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll index 6552ccddddab4..f77df2d812dfe 100644 --- a/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll +++ b/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll @@ -1,5 +1,6 @@ ; RUN: opt -S -passes='dxil-op-lower' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-SM63 ; RUN: opt -S -passes='dxil-op-lower' -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-SM66 +; RUN: opt -S -dxil-op-lower -dxil-prepare -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-PREPARE ; CHECK-LABEL: define void @test_legal_lifetime() { ; @@ -15,6 +16,14 @@ ; CHECK-SM66-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-SM66-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) ; +; CHECK-PREPARE-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4 +; CHECK-PREPARE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0 +; CHECK-PREPARE-NEXT: [[BITCAST:%.*]] = bitcast ptr [[ACCUM_I_FLAT]] to ptr +; CHECK-PREPARE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[BITCAST]]) +; CHECK-PREPARE-NEXT: store i32 0, ptr [[GEP]], align 4 +; CHECK-PREPARE-NEXT: [[BITCAST:%.*]] = bitcast ptr [[ACCUM_I_FLAT]] to ptr +; CHECK-PREPARE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[BITCAST]]) +; ; CHECK-NEXT: ret void ; define void @test_legal_lifetime() { @@ -26,6 +35,22 @@ define void @test_legal_lifetime() { ret void } +; CHECK-PREPARE-DAG: 
attributes [[LIFETIME_ATTRS:#.*]] = { nounwind } + +; CHECK-PREPARE-DAG: ; Function Attrs: nounwind +; CHECK-PREPARE-DAG: declare void @llvm.lifetime.start.p0(i64, ptr) [[LIFETIME_ATTRS]] + +; CHECK-PREPARE-DAG: ; Function Attrs: nounwind +; CHECK-PREPARE-DAG: declare void @llvm.lifetime.end.p0(i64, ptr) [[LIFETIME_ATTRS]] + +; Function Attrs: nounwind memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64, ptr) #0 + +; Function Attrs: nounwind memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64, ptr) #0 + +attributes #0 = { nounwind memory(argmem: readwrite) } + ; Set the validator version to 1.6 !dx.valver = !{!0} !0 = !{i32 1, i32 6} diff --git a/llvm/test/CodeGen/Hexagon/addsat.ll b/llvm/test/CodeGen/Hexagon/addsat.ll new file mode 100644 index 0000000000000..489c7d5a0fdff --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/addsat.ll @@ -0,0 +1,157 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Test for saturating add instructions. + +; CHECK-LABEL: test1 +; CHECK: v{{.*}}.ub = vadd(v{{[0-9]+}}.ub,v{{[0-9]+}}.ub):sat +define <128 x i8> @test1(<128 x i8>* %a0, <128 x i8>* %a1) #0 { +entry: + %wide.load = load <128 x i8>, <128 x i8>* %a0, align 1 + %wide.load62 = load <128 x i8>, <128 x i8>* %a1, align 1 + %add = call <128 x i8> @llvm.uadd.sat.v128i8(<128 x i8> %wide.load, <128 x i8> %wide.load62) + ret <128 x i8> %add +} + +; CHECK-LABEL: test2 +; CHECK: v{{.*}}.b = vadd(v{{[0-9]+}}.b,v{{[0-9]+}}.b):sat +define <128 x i8> @test2(<128 x i8>* %a0, <128 x i8>* %a1) #0 { +entry: + %wide.load = load <128 x i8>, <128 x i8>* %a0, align 1 + %wide.load62 = load <128 x i8>, <128 x i8>* %a1, align 1 + %add = call <128 x i8> @llvm.sadd.sat.v128i8(<128 x i8> %wide.load, <128 x i8> %wide.load62) + ret <128 x i8> %add +} + +; CHECK-LABEL: test3 +; CHECK: v{{.*}}.uh = vadd(v{{[0-9]+}}.uh,v{{[0-9]+}}.uh):sat +define <64 x i16> @test3(<64 x i16>* %a0, <64 x i16>* %a1) #0 { +entry: + %wide.load = load <64 x i16>, <64 x i16>* %a0, align 1 + %wide.load62 
= load <64 x i16>, <64 x i16>* %a1, align 1 + %add = call <64 x i16> @llvm.uadd.sat.v64i16(<64 x i16> %wide.load, <64 x i16> %wide.load62) + ret <64 x i16> %add +} + +; CHECK-LABEL: test4 +; CHECK: v{{.*}}.h = vadd(v{{[0-9]+}}.h,v{{[0-9]+}}.h):sat +define <64 x i16> @test4(<64 x i16>* %a0, <64 x i16>* %a1) #0 { +entry: + %wide.load = load <64 x i16>, <64 x i16>* %a0, align 1 + %wide.load62 = load <64 x i16>, <64 x i16>* %a1, align 1 + %add = call <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16> %wide.load, <64 x i16> %wide.load62) + ret <64 x i16> %add +} + +; CHECK-LABEL: test5 +; CHECK: v{{.*}}.uw = vadd(v{{[0-9]+}}.uw,v{{[0-9]+}}.uw):sat +define <32 x i32> @test5(<32 x i32>* %a0, <32 x i32>* %a1) #0 { +entry: + %wide.load = load <32 x i32>, <32 x i32>* %a0, align 1 + %wide.load62 = load <32 x i32>, <32 x i32>* %a1, align 1 + %add = call <32 x i32> @llvm.uadd.sat.v32i32(<32 x i32> %wide.load, <32 x i32> %wide.load62) + ret <32 x i32> %add +} + +; CHECK-LABEL: test6 +; CHECK: v{{.*}}.w = vadd(v{{[0-9]+}}.w,v{{[0-9]+}}.w):sat +define <32 x i32> @test6(<32 x i32>* %a0, <32 x i32>* %a1) #0 { +entry: + %wide.load = load <32 x i32>, <32 x i32>* %a0, align 1 + %wide.load62 = load <32 x i32>, <32 x i32>* %a1, align 1 + %add = call <32 x i32> @llvm.sadd.sat.v32i32(<32 x i32> %wide.load, <32 x i32> %wide.load62) + ret <32 x i32> %add +} + +; CHECK-LABEL: test7 +; CHECK: v{{[0-9]+}}:{{[0-9]+}}.ub = vadd(v{{[0-9]+}}:{{[0-9]+}}.ub,v{{[0-9]+}}:{{[0-9]+}}.ub):sat +define <256 x i8> @test7(<256 x i8>* %a0, <256 x i8>* %a1) #0 { +entry: + %wide.load = load <256 x i8>, <256 x i8>* %a0, align 1 + %wide.load62 = load <256 x i8>, <256 x i8>* %a1, align 1 + %add = call <256 x i8> @llvm.uadd.sat.v256i8(<256 x i8> %wide.load, <256 x i8> %wide.load62) + ret <256 x i8> %add +} + +; CHECK-LABEL: test8 +; CHECK: v{{[0-9]+}}:{{[0-9]+}}.b = vadd(v{{[0-9]+}}:{{[0-9]+}}.b,v{{[0-9]+}}:{{[0-9]+}}.b):sat +define <256 x i8> @test8(<256 x i8>* %a0, <256 x i8>* %a1) #0 { +entry: + %wide.load = load <256 x 
i8>, <256 x i8>* %a0, align 1 + %wide.load62 = load <256 x i8>, <256 x i8>* %a1, align 1 + %add = call <256 x i8> @llvm.sadd.sat.v256i8(<256 x i8> %wide.load, <256 x i8> %wide.load62) + ret <256 x i8> %add +} + +; CHECK-LABEL: test9 +; CHECK: v{{[0-9]+}}:{{[0-9]+}}.uh = vadd(v{{[0-9]+}}:{{[0-9]+}}.uh,v{{[0-9]+}}:{{[0-9]+}}.uh):sat +define <128 x i16> @test9(<128 x i16>* %a0, <128 x i16>* %a1) #0 { +entry: + %wide.load = load <128 x i16>, <128 x i16>* %a0, align 1 + %wide.load62 = load <128 x i16>, <128 x i16>* %a1, align 1 + %add = call <128 x i16> @llvm.uadd.sat.v128i16(<128 x i16> %wide.load, <128 x i16> %wide.load62) + ret <128 x i16> %add +} + +; CHECK-LABEL: test10 +; CHECK: v{{[0-9]+}}:{{[0-9]+}}.h = vadd(v{{[0-9]+}}:{{[0-9]+}}.h,v{{[0-9]+}}:{{[0-9]+}}.h):sat +define <128 x i16> @test10(<128 x i16>* %a0, <128 x i16>* %a1) #0 { +entry: + %wide.load = load <128 x i16>, <128 x i16>* %a0, align 1 + %wide.load62 = load <128 x i16>, <128 x i16>* %a1, align 1 + %add = call <128 x i16> @llvm.sadd.sat.v128i16(<128 x i16> %wide.load, <128 x i16> %wide.load62) + ret <128 x i16> %add +} + +; CHECK-LABEL: test11 +; CHECK: v{{[0-9]+}}:{{[0-9]+}}.uw = vadd(v{{[0-9]+}}:{{[0-9]+}}.uw,v{{[0-9]+}}:{{[0-9]+}}.uw):sat +define <64 x i32> @test11(<64 x i32>* %a0, <64 x i32>* %a1) #0 { +entry: + %wide.load = load <64 x i32>, <64 x i32>* %a0, align 1 + %wide.load62 = load <64 x i32>, <64 x i32>* %a1, align 1 + %add = call <64 x i32> @llvm.uadd.sat.v64i32(<64 x i32> %wide.load, <64 x i32> %wide.load62) + ret <64 x i32> %add +} + +; CHECK-LABEL: test12 +; CHECK: v{{[0-9]+}}:{{[0-9]+}}.w = vadd(v{{[0-9]+}}:{{[0-9]+}}.w,v{{[0-9]+}}:{{[0-9]+}}.w):sat +define <64 x i32> @test12(<64 x i32>* %a0, <64 x i32>* %a1) #0 { +entry: + %wide.load = load <64 x i32>, <64 x i32>* %a0, align 1 + %wide.load62 = load <64 x i32>, <64 x i32>* %a1, align 1 + %add = call <64 x i32> @llvm.sadd.sat.v64i32(<64 x i32> %wide.load, <64 x i32> %wide.load62) + ret <64 x i32> %add +} + +; CHECK-LABEL: test13 +; CHECK: 
r{{[0-9]+}} = add(r{{[0-9]+}},r{{[0-9]+}}):sat +define i32 @test13(i32 %a0, i32 %a1) #0 { +entry: + %add = call i32 @llvm.sadd.sat.i32(i32 %a0, i32 %a1) + ret i32 %add +} + +; CHECK-LABEL: test14 +; CHECK: r{{[0-9]+}}:{{[0-9]+}} = add(r{{[0-9]+}}:{{[0-9]+}},r{{[0-9]+}}:{{[0-9]+}}):sat +define i64 @test14(i64 %a0, i64 %a1) #0 { +entry: + %add = call i64 @llvm.sadd.sat.i64(i64 %a0, i64 %a1) + ret i64 %add +} + +declare <128 x i8> @llvm.uadd.sat.v128i8(<128 x i8>, <128 x i8>) #1 +declare <128 x i8> @llvm.sadd.sat.v128i8(<128 x i8>, <128 x i8>) #1 +declare <64 x i16> @llvm.uadd.sat.v64i16(<64 x i16>, <64 x i16>) #1 +declare <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16>, <64 x i16>) #1 +declare <32 x i32> @llvm.uadd.sat.v32i32(<32 x i32>, <32 x i32>) #1 +declare <32 x i32> @llvm.sadd.sat.v32i32(<32 x i32>, <32 x i32>) #1 +declare <256 x i8> @llvm.uadd.sat.v256i8(<256 x i8>, <256 x i8>) #1 +declare <256 x i8> @llvm.sadd.sat.v256i8(<256 x i8>, <256 x i8>) #1 +declare <128 x i16> @llvm.uadd.sat.v128i16(<128 x i16>, <128 x i16>) #1 +declare <128 x i16> @llvm.sadd.sat.v128i16(<128 x i16>, <128 x i16>) #1 +declare <64 x i32> @llvm.uadd.sat.v64i32(<64 x i32>, <64 x i32>) #1 +declare <64 x i32> @llvm.sadd.sat.v64i32(<64 x i32>, <64 x i32>) #1 +declare i32 @llvm.sadd.sat.i32(i32, i32) +declare i64 @llvm.sadd.sat.i64(i64, i64) + +attributes #0 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length128b" } +attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/Hexagon/bitcast-i32-to-v32i1.ll b/llvm/test/CodeGen/Hexagon/bitcast-i32-to-v32i1.ll new file mode 100644 index 0000000000000..741589d3cde74 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/bitcast-i32-to-v32i1.ll @@ -0,0 +1,20 @@ +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s + +; CHECK: [[VREG1:v([0-9]+)]] = vsplat(r{{[0-9]*}}) +; CHECK: [[VREG2:v([0-9]+)]] = vand([[VREG1]],v{{[0-9]+}}) +; CHECK: q[[QREG:[0-9]+]] = 
vand([[VREG2]],r{{[0-9]+}}) + +define void @bitcast_i32_to_v32i1_full(ptr %in, ptr %out) { +entry: + %load = load i32, ptr %in, align 4 + %bitcast = bitcast i32 %load to <32 x i1> + %e0 = extractelement <32 x i1> %bitcast, i32 0 + %e1 = extractelement <32 x i1> %bitcast, i32 1 + %z0 = zext i1 %e0 to i8 + %z1 = zext i1 %e1 to i8 + %ptr0 = getelementptr i8, ptr %out, i32 0 + %ptr1 = getelementptr i8, ptr %out, i32 1 + store i8 %z0, ptr %ptr0, align 1 + store i8 %z1, ptr %ptr1, align 1 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/bitcast-v2i16-to-v32i1.ll b/llvm/test/CodeGen/Hexagon/bitcast-v2i16-to-v32i1.ll new file mode 100644 index 0000000000000..45068e8e080b8 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/bitcast-v2i16-to-v32i1.ll @@ -0,0 +1,16 @@ +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s + +; CHECK: [[REG0:r[0-9]+]] = memw(r{{[0-9]+}}+#0) +; CHECK: [[VREG1:v([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG2:v([0-9]+)]] = vand([[VREG1]],v{{[0-9]+}}) +; CHECK: q[[QREG:[0-9]+]] = vand([[VREG2]],r{{[0-9]+}}) + +define void @bitcast_v2i16_to_v32i1(ptr %in, ptr %out) { +entry: + %load = load <2 x i16>, ptr %in, align 4 + %bitcast = bitcast <2 x i16> %load to <32 x i1> + %extract = extractelement <32 x i1> %bitcast, i32 0 + %zext = zext i1 %extract to i8 + store i8 %zext, ptr %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/bitcast-v4i8-to-v32i1.ll b/llvm/test/CodeGen/Hexagon/bitcast-v4i8-to-v32i1.ll new file mode 100644 index 0000000000000..15219332856c5 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/bitcast-v4i8-to-v32i1.ll @@ -0,0 +1,16 @@ +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s + +; CHECK: [[REG0:r[0-9]+]] = memw(r{{[0-9]+}}+#0) +; CHECK: [[VREG1:v([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG2:v([0-9]+)]] = vand([[VREG1]],v{{[0-9]+}}) +; CHECK: q[[QREG:[0-9]+]] = vand([[VREG2]],r{{[0-9]+}}) + +define void @bitcast_v4i8_to_v32i1(ptr %in, ptr %out) { +entry: + %load = 
load <4 x i8>, ptr %in, align 4 + %bitcast = bitcast <4 x i8> %load to <32 x i1> + %extract = extractelement <32 x i1> %bitcast, i32 0 + %zext = zext i1 %extract to i8 + store i8 %zext, ptr %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1.ll b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1.ll index 1090b64fcad52..c91f16d91d1be 100644 --- a/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1.ll +++ b/llvm/test/CodeGen/Hexagon/isel/trunc-vNi1.ll @@ -1,24 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc --mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s | FileCheck %s define void @f0(<2 x i32> %a0, ptr %a1) { ; CHECK-LABEL: f0: -; CHECK: .cfi_startproc -; CHECK-NEXT: // %bb.0: // %b0 -; CHECK-NEXT: { -; CHECK-NEXT: r5:4 = combine(#1,#1) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r1:0 = and(r1:0,r5:4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: p0 = vcmpw.eq(r1:0,#1) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r0 = p0 -; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: memb(r2+#0) = r0.new -; CHECK-NEXT: } +; CHECK: r[[REG1H:([0-9]+)]]:[[REG1L:([0-9]+)]] = combine(#1,#1) +; CHECK: r[[REG2H:([0-9]+)]]:[[REG2L:([0-9]+)]] = and(r[[REG2H]]:[[REG2L]],r[[REG1H]]:[[REG1L]]) +; CHECK: p{{[0-9]+}} = vcmpw.eq(r[[REG2H]]:[[REG2L]],#1) b0: %v0 = trunc <2 x i32> %a0 to <2 x i1> store <2 x i1> %v0, ptr %a1, align 1 @@ -27,20 +14,9 @@ b0: define void @f1(<4 x i16> %a0, ptr %a1) { ; CHECK-LABEL: f1: -; CHECK: .cfi_startproc -; CHECK-NEXT: // %bb.0: // %b0 -; CHECK-NEXT: { -; CHECK-NEXT: r0 = and(r0,##65537) -; CHECK-NEXT: r1 = and(r1,##65537) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: p0 = vcmph.eq(r1:0,#1) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r0 = p0 -; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: memb(r2+#0) = r0.new -; CHECK-NEXT: } +; CHECK: [[REG0:r([0-9]+)]] = and([[REG0]],##65537) +; CHECK: [[REG1:r([0-9]+)]] = 
and([[REG1]],##65537) +; CHECK: p{{[0-9]+}} = vcmph.eq(r{{[0-9]+}}:{{[0-9]+}},#1) b0: %v0 = trunc <4 x i16> %a0 to <4 x i1> store <4 x i1> %v0, ptr %a1, align 1 @@ -49,22 +25,35 @@ b0: define void @f2(<8 x i8> %a0, ptr %a1) { ; CHECK-LABEL: f2: -; CHECK: .cfi_startproc -; CHECK-NEXT: // %bb.0: // %b0 -; CHECK-NEXT: { -; CHECK-NEXT: r0 = and(r0,##16843009) -; CHECK-NEXT: r1 = and(r1,##16843009) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: p0 = vcmpb.eq(r1:0,#1) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r0 = p0 -; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: memb(r2+#0) = r0.new -; CHECK-NEXT: } +; CHECK: [[REG0:r([0-9]+)]] = and([[REG0]],##16843009) +; CHECK: [[REG1:r([0-9]+)]] = and([[REG1]],##16843009) +; CHECK: p{{[0-9]+}} = vcmpb.eq(r{{[0-9]+}}:{{[0-9]+}},#1) b0: %v0 = trunc <8 x i8> %a0 to <8 x i1> store <8 x i1> %v0, ptr %a1, align 1 ret void } + +define void @f3(<4 x i8> %a0, ptr %a1) { +; CHECK-LABEL: f3: +; CHECK: r[[REGH:([0-9]+)]]:[[REGL:([0-9]+)]] = vzxtbh(r{{[0-9]+}}) +; CHECK: r[[REGL]] = and(r[[REGL]],##65537) +; CHECK: r[[REGH]] = and(r[[REGH]],##65537) +; CHECK: p{{[0-9]+}} = vcmph.eq(r[[REGH]]:[[REGL]],#1) +b0: + %v0 = trunc <4 x i8> %a0 to <4 x i1> + store <4 x i1> %v0, ptr %a1, align 1 + ret void +} + +define void @f4(<2 x i16> %a0, ptr %a1) { +; CHECK-LABEL: f4: +; CHECK: r[[REGH:([0-9]+)]]:[[REGL:([0-9]+)]] = vzxthw(r{{[0-9]+}}) +; CHECK: r[[REG1H:([0-9]+)]]:[[REG1L:([0-9]+)]] = combine(#1,#1) +; CHECK: r[[REGH]]:[[REGL]] = and(r[[REGH]]:[[REGL]],r[[REG1H]]:[[REG1L]]) +; CHECK: p{{[0-9]+}} = vcmpw.eq(r[[REGH]]:[[REGL]],#1) +b0: + %v0 = trunc <2 x i16> %a0 to <2 x i1> + store <2 x i1> %v0, ptr %a1, align 1 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir index 17ee07f49324a..7182e0a112560 100644 --- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir +++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir @@ -14,16 +14,14 @@ # ``` # 
# Loop-carried dependencies exist from store for a[i+1] to load/store for a[i], but not vice versa. -# FIXME: Currently the following dependencies are missed. -# -# Loop carried edges from SU(6) -# Order -# SU(4) -# Loop carried edges from SU(8) -# Order -# SU(4) # CHECK: ===== Loop Carried Edges Begin ===== +# CHECK-NEXT: Loop carried edges from SU(6) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(4) +# CHECK-NEXT: Loop carried edges from SU(8) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(4) # CHECK-NEXT: ===== Loop Carried Edges End ===== --- | diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir index 850e602c9146f..56485e04ad35c 100644 --- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir +++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir @@ -14,16 +14,14 @@ # ``` # # Loop-carried dependencies exist from load/store for a[i] to store for a[i-1], but not vice versa. -# FIXME: Currently the following dependencies are missed. -# -# Loop carried edges from SU(5) -# Order -# SU(7) # CHECK: ===== Loop Carried Edges Begin ===== # CHECK-NEXT: Loop carried edges from SU(3) # CHECK-NEXT: Order # CHECK-NEXT: SU(7) +# CHECK-NEXT: Loop carried edges from SU(5) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(7) # CHECK-NEXT: ===== Loop Carried Edges End ===== --- | diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir index ca59b97dd11e9..69f56fa7934f2 100644 --- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir +++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir @@ -14,13 +14,11 @@ # ``` # # Loop-carried dependencies exist from load for a[i+1] to store for a[i]. -# FIXME: Currently the following dependencies are missed. 
-# -# Loop carried edges from SU(7) -# Order -# SU(5) # CHECK: ===== Loop Carried Edges Begin ===== +# CHECK-NEXT: Loop carried edges from SU(7) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(5) # CHECK-NEXT: ===== Loop Carried Edges End ===== --- | diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir index 4bc4b48735947..cc4e9e1d67c5c 100644 --- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir +++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir @@ -20,15 +20,15 @@ # # FIXME: Currently the following dependencies are missed. # -# Loop carried edges from SU(4) -# Order -# SU(3) # CHECK: ===== Loop Carried Edges Begin ===== # CHECK-NEXT: Loop carried edges from SU(2) # CHECK-NEXT: Order # CHECK-NEXT: SU(3) # CHECK-NEXT: SU(4) +# CHECK-NEXT: Loop carried edges from SU(4) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(3) # CHECK-NEXT: ===== Loop Carried Edges End ===== --- | diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir index 77c3d569db181..3c2e0c40680c8 100644 --- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir +++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir @@ -23,20 +23,18 @@ # Note that if there is already a dependency between two instructions, we don't # add loop-carried on between them since non-loop-carried one imposes stronger # constraint than loop-carried one. -# -# FIXME: Currently the following dependencies are missed. 
-# Loop carried edges from SU(5) -# Order -# SU(2) -# Loop carried edges from SU(6) -# Order -# SU(5) -# Loop carried edges from SU(8) -# Order -# SU(3) -# SU(5) # CHECK: ===== Loop Carried Edges Begin ===== +# CHECK-NEXT: Loop carried edges from SU(5) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(2) +# CHECK-NEXT: Loop carried edges from SU(6) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(5) +# CHECK-NEXT: Loop carried edges from SU(8) +# CHECK-NEXT: Order +# CHECK-NEXT: SU(3) +# CHECK-NEXT: SU(5) # CHECK-NEXT: ===== Loop Carried Edges End ===== --- | diff --git a/llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll b/llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll new file mode 100644 index 0000000000000..231e82a6d53ac --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/concat-vectors.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define <32 x i8> @concat_poison_v32i8_1(<16 x i8> %a) { +; CHECK-LABEL: concat_poison_v32i8_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i8> %a, <16 x i8> poison, + <32 x i32> + ret <32 x i8> %1 +} + +define <32 x i8> @concat_poison_v32i8_2(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: concat_poison_v32i8_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i8> %b, <16 x i8> poison, + <32 x i32> + ret <32 x i8> %1 +} + +define <32 x i8> @concat_vectors_v32i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: concat_vectors_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i8> %a, <16 x i8> %b, + <32 x i32> + ret <32 x i8> %1 +} + +define <16 x i16> 
@concat_poison_v16i16_1(<8 x i16> %a) { +; CHECK-LABEL: concat_poison_v16i16_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i16> %a, <8 x i16> poison, + <16 x i32> + ret <16 x i16> %1 +} + +define <16 x i16> @concat_poison_v16i16_2(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: concat_poison_v16i16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i16> %b, <8 x i16> poison, + <16 x i32> + ret <16 x i16> %1 +} + +define <16 x i16> @concat_vectors_v16i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: concat_vectors_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i16> %a, <8 x i16> %b, + <16 x i32> + ret <16 x i16> %1 +} + +define <8 x i32> @concat_poison_v8i32_1(<4 x i32> %a) { +; CHECK-LABEL: concat_poison_v8i32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i32> %a, <4 x i32> poison, + <8 x i32> + ret <8 x i32> %1 +} + +define <8 x i32> @concat_poison_v8i32_2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: concat_poison_v8i32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i32> %b, <4 x i32> poison, + <8 x i32> + ret <8 x i32> %1 +} + +define <8 x i32> @concat_vectors_v8i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: concat_vectors_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, + <8 x i32> + ret <8 x i32> %1 +} + +define <8 x float> @concat_poison_v8f32_1(<4 x 
float> %a) { +; CHECK-LABEL: concat_poison_v8f32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x float> %a, <4 x float> poison, + <8 x i32> + ret <8 x float> %1 +} + +define <8 x float> @concat_poison_v8f32_2(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: concat_poison_v8f32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x float> %b, <4 x float> poison, + <8 x i32> + ret <8 x float> %1 +} + +define <8 x float> @concat_vectors_v8f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: concat_vectors_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x float> %a, <4 x float> %b, + <8 x i32> + ret <8 x float> %1 +} + +define <4 x i64> @concat_poison_v8i64_1(<2 x i64> %a) { +; CHECK-LABEL: concat_poison_v8i64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <4 x i32> + ret <4 x i64> %1 +} + +define <4 x i64> @concat_poison_v8i64_2(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: concat_poison_v8i64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <2 x i64> %b, <2 x i64> poison, <4 x i32> + ret <4 x i64> %1 +} + +define <4 x i64> @concat_vectors_v8i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: concat_vectors_v8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <4 x i32> + ret <4 x i64> %1 +} + +define <4 x double> @concat_poison_v8f64_1(<2 x double> %a) { +; 
CHECK-LABEL: concat_poison_v8f64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <2 x double> %a, <2 x double> poison, <4 x i32> + ret <4 x double> %1 +} + +define <4 x double> @concat_poison_v8f64_2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: concat_poison_v8f64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> + ret <4 x double> %1 +} + +define <4 x double> @concat_vectors_v8f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: concat_vectors_v8f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> + ret <4 x double> %1 +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/insert-extract-subvector.ll b/llvm/test/CodeGen/LoongArch/lasx/insert-extract-subvector.ll new file mode 100644 index 0000000000000..7a90afca376db --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/insert-extract-subvector.ll @@ -0,0 +1,668 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64) + +define <8 x i32> @insert_lo128_v8i32_1(<4 x i32> %a) { +; CHECK-LABEL: insert_lo128_v8i32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %a, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @insert_hi128_v8i32_1(<4 x i32> %a) { +; CHECK-LABEL: insert_hi128_v8i32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 
killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2 +; CHECK-NEXT: ret +entry: + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %a, i64 4) + ret <8 x i32> %1 +} + +define <8 x i32> @insert_lo128_v8i32_2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: insert_lo128_v8i32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %b, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @insert_hi128_v8i32_2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: insert_hi128_v8i32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> %b, i64 4) + ret <8 x i32> %1 +} + +define <8 x i32> @insert_lo128_v8i32_3(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: insert_lo128_v8i32_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %a, <4 x i32> %b, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @insert_hi128_v8i32_3(<8 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: insert_hi128_v8i32_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %a, <4 x i32> %b, i64 4) + ret <8 x i32> %1 +} + +declare <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float>, <4 x float>, i64) + +define <8 x float> @insert_lo128_v8f32_1(<4 x float> %a) { +; CHECK-LABEL: insert_lo128_v8f32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: 
+ %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> %a, i64 0) + ret <8 x float> %1 +} + +define <8 x float> @insert_hi128_v8f32_1(<4 x float> %a) { +; CHECK-LABEL: insert_hi128_v8f32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2 +; CHECK-NEXT: ret +entry: + %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> %a, i64 4) + ret <8 x float> %1 +} + +define <8 x float> @insert_lo128_v8f32_2(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: insert_lo128_v8f32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> %b, i64 0) + ret <8 x float> %1 +} + +define <8 x float> @insert_hi128_v8f32_2(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: insert_hi128_v8f32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> %b, i64 4) + ret <8 x float> %1 +} + +define <8 x float> @insert_lo128_v8f32_3(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: insert_lo128_v8f32_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x float> %a, <4 x float> %b, i64 0) + ret <8 x float> %1 +} + +define <8 x float> @insert_hi128_v8f32_3(<8 x float> %a, <4 x float> %b) { +; CHECK-LABEL: insert_hi128_v8f32_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <8 x float> @llvm.experimental.vector.insert.v8f32.v4f32(<8 x 
float> %a, <4 x float> %b, i64 4) + ret <8 x float> %1 +} + +declare <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64>, <2 x i64>, i64) + +define <4 x i64> @insert_lo128_v4i64_1(<2 x i64> %a) { +; CHECK-LABEL: insert_lo128_v4i64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = call <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> %a, i64 0) + ret <4 x i64> %1 +} + +define <4 x i64> @insert_hi128_v4i64_1(<2 x i64> %a) { +; CHECK-LABEL: insert_hi128_v4i64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2 +; CHECK-NEXT: ret +entry: + %1 = call <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> %a, i64 2) + ret <4 x i64> %1 +} + +define <4 x i64> @insert_lo128_v4i64_2(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: insert_lo128_v4i64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = call <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> %b, i64 0) + ret <4 x i64> %1 +} + +define <4 x i64> @insert_hi128_v4i64_2(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: insert_hi128_v4i64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> %b, i64 2) + ret <4 x i64> %1 +} + +define <4 x i64> @insert_lo128_v4i64_3(<4 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: insert_lo128_v4i64_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %1 = call <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64> %a, <2 x i64> %b, i64 0) + ret <4 x i64> %1 +} + +define <4 x i64> @insert_hi128_v4i64_3(<4 x 
i64> %a, <2 x i64> %b) { +; CHECK-LABEL: insert_hi128_v4i64_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <4 x i64> @llvm.experimental.vector.insert.v4i64.v2i64(<4 x i64> %a, <2 x i64> %b, i64 2) + ret <4 x i64> %1 +} + +declare <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double>, <2 x double>, i64) + +define <4 x double> @insert_lo128_v4f64_1(<2 x double> %a) { +; CHECK-LABEL: insert_lo128_v4f64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = call <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> %a, i64 0) + ret <4 x double> %1 +} + +define <4 x double> @insert_hi128_v4f64_1(<2 x double> %a) { +; CHECK-LABEL: insert_hi128_v4f64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2 +; CHECK-NEXT: ret +entry: + %1 = call <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> %a, i64 2) + ret <4 x double> %1 +} + +define <4 x double> @insert_lo128_v4f64_2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: insert_lo128_v4f64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = call <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> %b, i64 0) + ret <4 x double> %1 +} + +define <4 x double> @insert_hi128_v4f64_2(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: insert_hi128_v4f64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> %b, i64 2) + ret <4 x double> %1 +} + +define <4 x double> @insert_lo128_v4f64_3(<4 x double> %a, <2 x 
double> %b) { +; CHECK-LABEL: insert_lo128_v4f64_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %1 = call <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double> %a, <2 x double> %b, i64 0) + ret <4 x double> %1 +} + +define <4 x double> @insert_hi128_v4f64_3(<4 x double> %a, <2 x double> %b) { +; CHECK-LABEL: insert_hi128_v4f64_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <4 x double> @llvm.experimental.vector.insert.v4f64.v2f64(<4 x double> %a, <2 x double> %b, i64 2) + ret <4 x double> %1 +} + +declare <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16>, <8 x i16>, i64) + +define <16 x i16> @insert_lo128_v16i16_1(<8 x i16> %a) { +; CHECK-LABEL: insert_lo128_v16i16_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = call <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> %a, i64 0) + ret <16 x i16> %1 +} + +define <16 x i16> @insert_hi128_v16i16_1(<8 x i16> %a) { +; CHECK-LABEL: insert_hi128_v16i16_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2 +; CHECK-NEXT: ret +entry: + %1 = call <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> %a, i64 8) + ret <16 x i16> %1 +} + +define <16 x i16> @insert_lo128_v16i16_2(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: insert_lo128_v16i16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = call <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> %b, i64 0) + ret <16 x i16> %1 +} + +define <16 x i16> @insert_hi128_v16i16_2(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: insert_hi128_v16i16_2: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> %b, i64 8) + ret <16 x i16> %1 +} + +define <16 x i16> @insert_lo128_v16i16_3(<16 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: insert_lo128_v16i16_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %1 = call <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16> %a, <8 x i16> %b, i64 0) + ret <16 x i16> %1 +} + +define <16 x i16> @insert_hi128_v16i16_3(<16 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: insert_hi128_v16i16_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <16 x i16> @llvm.experimental.vector.insert.v16i16.v8i16(<16 x i16> %a, <8 x i16> %b, i64 8) + ret <16 x i16> %1 +} + +declare <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8>, <16 x i8>, i64) + +define <32 x i8> @insert_lo128_v32i8_1(<16 x i8> %a) { +; CHECK-LABEL: insert_lo128_v32i8_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %1 = call <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> %a, i64 0) + ret <32 x i8> %1 +} + +define <32 x i8> @insert_hi128_v32i8_1(<16 x i8> %a) { +; CHECK-LABEL: insert_hi128_v32i8_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 2 +; CHECK-NEXT: ret +entry: + %1 = call <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> %a, i64 16) + ret <32 x i8> %1 +} + +define <32 x i8> @insert_lo128_v32i8_2(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: insert_lo128_v32i8_2: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vori.b $vr0, $vr1, 0 +; CHECK-NEXT: ret +entry: + %1 = call <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> %b, i64 0) + ret <32 x i8> %1 +} + +define <32 x i8> @insert_hi128_v32i8_2(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: insert_hi128_v32i8_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> %b, i64 16) + ret <32 x i8> %1 +} + +define <32 x i8> @insert_lo128_v32i8_3(<32 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: insert_lo128_v32i8_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %1 = call <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8> %a, <16 x i8> %b, i64 0) + ret <32 x i8> %1 +} + +define <32 x i8> @insert_hi128_v32i8_3(<32 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: insert_hi128_v32i8_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %1 = call <32 x i8> @llvm.experimental.vector.insert.v32i8.v16i8(<32 x i8> %a, <16 x i8> %b, i64 16) + ret <32 x i8> %1 +} + +define <4 x i32> @extract_lo128_v8i32_1(<8 x i32> %a) { +; CHECK-LABEL: extract_lo128_v8i32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> + ret <4 x i32> %1 +} + +define <4 x i32> @extract_hi128_v8i32_1(<8 x i32> %a) { +; CHECK-LABEL: extract_hi128_v8i32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> + ret <4 x i32> %1 +} + +define <4 x i32> 
@extract_lo128_v8i32_2(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: extract_lo128_v8i32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + ret <4 x i32> %1 +} + +define <4 x i32> @extract_hi128_v8i32_2(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: extract_hi128_v8i32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + ret <4 x i32> %1 +} + +define <4 x float> @extract_lo128_v8f32_1(<8 x float> %a) { +; CHECK-LABEL: extract_lo128_v8f32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + ret <4 x float> %1 +} + +define <4 x float> @extract_hi128_v8f32_1(<8 x float> %a) { +; CHECK-LABEL: extract_hi128_v8f32_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + ret <4 x float> %1 +} + +define <4 x float> @extract_lo128_v8f32_2(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: extract_lo128_v8f32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + ret <4 x float> %1 +} + +define <4 x float> @extract_hi128_v8f32_2(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: extract_hi128_v8f32_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <8 x float> %b, <8 x float> 
poison, <4 x i32> + ret <4 x float> %1 +} + +define <2 x i64> @extract_lo128_v4i64_1(<4 x i64> %a) { +; CHECK-LABEL: extract_lo128_v4i64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i64> %a, <4 x i64> poison, <2 x i32> + ret <2 x i64> %1 +} + +define <2 x i64> @extract_hi128_v4i64_1(<4 x i64> %a) { +; CHECK-LABEL: extract_hi128_v4i64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i64> %a, <4 x i64> poison, <2 x i32> + ret <2 x i64> %1 +} + +define <2 x i64> @extract_lo128_v4i64_2(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: extract_lo128_v4i64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> + ret <2 x i64> %1 +} + +define <2 x i64> @extract_hi128_v4i64_2(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: extract_hi128_v4i64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> + ret <2 x i64> %1 +} + +define <2 x double> @extract_lo128_v4f64_a(<4 x double> %a) { +; CHECK-LABEL: extract_lo128_v4f64_a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + ret <2 x double> %1 +} + +define <2 x double> @extract_hi128_v4f64_1(<4 x double> %a) { +; CHECK-LABEL: extract_hi128_v4f64_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + ret 
<2 x double> %1 +} + +define <2 x double> @extract_lo128_v4f64_2(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: extract_lo128_v4f64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + ret <2 x double> %1 +} + +define <2 x double> @extract_hi128_v4f64_2(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: extract_hi128_v4f64_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + ret <2 x double> %1 +} + +define <8 x i16> @extract_lo128_v16i16_1(<16 x i16> %a) { +; CHECK-LABEL: extract_lo128_v16i16_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i16> %a, <16 x i16> poison, + <8 x i32> + ret <8 x i16> %1 +} + +define <8 x i16> @extract_hi128_v16i16_1(<16 x i16> %a) { +; CHECK-LABEL: extract_hi128_v16i16_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i16> %a, <16 x i16> poison, + <8 x i32> + ret <8 x i16> %1 +} + +define <8 x i16> @extract_lo128_v16i16_2(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: extract_lo128_v16i16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <16 x i16> %b, <16 x i16> poison, + <8 x i32> + ret <8 x i16> %1 +} + +define <8 x i16> @extract_hi128_v16i16_2(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: extract_hi128_v16i16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret 
+entry: + %1 = shufflevector <16 x i16> %b, <16 x i16> poison, + <8 x i32> + ret <8 x i16> %1 +} + +define <16 x i8> @extract_lo128_v32i8_1(<32 x i8> %a) { +; CHECK-LABEL: extract_lo128_v32i8_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <32 x i8> %a, <32 x i8> poison, + <16 x i32> + ret <16 x i8> %1 +} + +define <16 x i8> @extract_hi128_v32i8_1(<32 x i8> %a) { +; CHECK-LABEL: extract_hi128_v32i8_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <32 x i8> %a, <32 x i8> poison, + <16 x i32> + ret <16 x i8> %1 +} + +define <16 x i8> @extract_lo128_v32i8_2(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: extract_lo128_v32i8_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <32 x i8> %b, <32 x i8> poison, + <16 x i32> + ret <16 x i8> %1 +} + +define <16 x i8> @extract_hi128_v32i8_2(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: extract_hi128_v32i8_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %1 = shufflevector <32 x i8> %b, <32 x i8> poison, + <16 x i32> + ret <16 x i8> %1 +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/issue107355.ll b/llvm/test/CodeGen/LoongArch/lasx/issue107355.ll index 818bd4311615d..506b5c1232f25 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/issue107355.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/issue107355.ll @@ -18,10 +18,10 @@ define void @foo() { ; CHECK-NEXT: ld.d $a3, $a3, %got_pc_lo12(g_813) ; CHECK-NEXT: st.w $zero, $a1, 0 ; CHECK-NEXT: st.w $a2, $a3, 0 +; CHECK-NEXT: xvrepli.b $xr0, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: vrepli.b $vr0, 0 ; CHECK-NEXT: vst $vr0, $a0, 32 -; CHECK-NEXT: xvpermi.q 
$xr0, $xr0, 2 -; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: st.w $zero, $a0, 20 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index 1c8f019922e37..7f52e5293d964 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} @@ -7,57 +8,105 @@ declare [2 x float] @bara([2 x float] %input) declare {float, float} @bars({float, float} %input) define void @test_v2f32(<2 x float> %input, ptr %output) { -; CHECK-LABEL: @test_v2f32 +; CHECK-LABEL: test_v2f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), barv, (param0); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: ld.param.b64 %rd4, [test_v2f32_param_1]; +; CHECK-NEXT: st.b64 [%rd4], %rd2; +; CHECK-NEXT: ret; %call = tail call <2 x float> @barv(<2 x float> %input) -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0]; store <2 x float> %call, ptr %output, align 8 -; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } define void @test_v3f32(<3 x float> %input, ptr %output) { -; CHECK-LABEL: @test_v3f32 -; +; CHECK-LABEL: test_v3f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 
%r3, [test_v3f32_param_0+8]; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), barv3, (param0); +; CHECK-NEXT: ld.param.v2.b32 {%r4, %r5}, [retval0]; +; CHECK-NEXT: ld.param.b32 %r6, [retval0+8]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r4, %r5}; +; CHECK-NEXT: st.b32 [%rd1+8], %r6; +; CHECK-NEXT: ret; %call = tail call <3 x float> @barv3(<3 x float> %input) -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [retval0+8]; ; Make sure we don't load more values than than we need to. -; CHECK-NOT: ld.param.b32 [[E3:%r[0-9]+]], [retval0+12]; store <3 x float> %call, ptr %output, align 8 -; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8], -; -- This is suboptimal. We should do st.v2.f32 instead -; of combining 2xf32 info i64. 
-; CHECK-DAG: st.b64 [{{%rd[0-9]}}], -; CHECK: ret; ret void } define void @test_a2f32([2 x float] %input, ptr %output) { -; CHECK-LABEL: @test_a2f32 +; CHECK-LABEL: test_a2f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_a2f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_a2f32_param_0+4]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[8]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), bara, (param0); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: ld.param.b32 %r4, [retval0+4]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: ld.param.b64 %rd1, [test_a2f32_param_1]; +; CHECK-NEXT: st.b32 [%rd1+4], %r4; +; CHECK-NEXT: st.b32 [%rd1], %r3; +; CHECK-NEXT: ret; %call = tail call [2 x float] @bara([2 x float] %input) -; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.b32 [[ELEMA1:%r[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b32 [[ELEMA2:%r[0-9]+]], [retval0+4]; store [2 x float] %call, ptr %output, align 4 -; CHECK: } -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]] -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMA2]] ret void -; CHECK: ret } define void @test_s2f32({float, float} %input, ptr %output) { -; CHECK-LABEL: @test_s2f32 +; CHECK-LABEL: test_s2f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_s2f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_s2f32_param_0+4]; +; CHECK-NEXT: { // callseq 3, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[8]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), bars, (param0); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; 
+; CHECK-NEXT: ld.param.b32 %r4, [retval0+4]; +; CHECK-NEXT: } // callseq 3 +; CHECK-NEXT: ld.param.b64 %rd1, [test_s2f32_param_1]; +; CHECK-NEXT: st.b32 [%rd1+4], %r4; +; CHECK-NEXT: st.b32 [%rd1], %r3; +; CHECK-NEXT: ret; %call = tail call {float, float} @bars({float, float} %input) -; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.b32 [[ELEMS1:%r[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b32 [[ELEMS2:%r[0-9]+]], [retval0+4]; store {float, float} %call, ptr %output, align 4 -; CHECK: } -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]] -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMS2]] ret void -; CHECK: ret } diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index a386e4292777b..aee58a044a986 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -688,25 +688,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM70-NEXT: cvt.u32.u16 %r5, %rs8; +; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM70-NEXT: cvt.u32.u16 %r5, %rs2; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: cvt.u32.u16 %r7, %rs7; +; SM70-NEXT: cvt.u32.u16 %r7, %rs1; ; SM70-NEXT: shl.b32 %r8, %r7, 16; -; SM70-NEXT: cvt.u32.u16 %r9, %rs6; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r9, %rs4; ; SM70-NEXT: shl.b32 %r10, %r9, 16; -; SM70-NEXT: cvt.u32.u16 %r11, %rs5; +; SM70-NEXT: cvt.u32.u16 %r11, %rs3; ; SM70-NEXT: shl.b32 %r12, %r11, 16; -; SM70-NEXT: cvt.u32.u16 %r13, %rs4; +; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM70-NEXT: cvt.u32.u16 %r13, %rs6; ; SM70-NEXT: shl.b32 %r14, %r13, 16; -; SM70-NEXT: cvt.u32.u16 %r15, 
%rs3; +; SM70-NEXT: cvt.u32.u16 %r15, %rs5; ; SM70-NEXT: shl.b32 %r16, %r15, 16; -; SM70-NEXT: cvt.u32.u16 %r17, %rs2; +; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM70-NEXT: cvt.u32.u16 %r17, %rs8; ; SM70-NEXT: shl.b32 %r18, %r17, 16; -; SM70-NEXT: cvt.u32.u16 %r19, %rs1; +; SM70-NEXT: cvt.u32.u16 %r19, %rs7; ; SM70-NEXT: shl.b32 %r20, %r19, 16; ; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r18, %r16, %r14}; ; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r10, %r8, %r6}; @@ -721,18 +721,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-NEXT: cvt.f32.bf16 %r5, %rs8; -; SM80-NEXT: cvt.f32.bf16 %r6, %rs7; -; SM80-NEXT: cvt.f32.bf16 %r7, %rs6; -; SM80-NEXT: cvt.f32.bf16 %r8, %rs5; -; SM80-NEXT: cvt.f32.bf16 %r9, %rs4; -; SM80-NEXT: cvt.f32.bf16 %r10, %rs3; -; SM80-NEXT: cvt.f32.bf16 %r11, %rs2; -; SM80-NEXT: cvt.f32.bf16 %r12, %rs1; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: cvt.f32.bf16 %r5, %rs2; +; SM80-NEXT: cvt.f32.bf16 %r6, %rs1; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-NEXT: cvt.f32.bf16 %r7, %rs4; +; SM80-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM80-NEXT: cvt.f32.bf16 %r9, %rs6; +; SM80-NEXT: cvt.f32.bf16 %r10, %rs5; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM80-NEXT: cvt.f32.bf16 %r11, %rs8; +; SM80-NEXT: cvt.f32.bf16 %r12, %rs7; ; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-NEXT: ret; @@ -746,18 +746,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b64 %rd1, 
[test_extload_bf16x8_param_0]; ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1; +; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3; +; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5; +; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; @@ -771,18 +771,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM90-NEXT: cvt.f32.bf16 %r5, %rs8; -; SM90-NEXT: cvt.f32.bf16 %r6, %rs7; -; SM90-NEXT: cvt.f32.bf16 %r7, %rs6; -; SM90-NEXT: cvt.f32.bf16 %r8, %rs5; -; SM90-NEXT: cvt.f32.bf16 %r9, %rs4; -; SM90-NEXT: cvt.f32.bf16 %r10, 
%rs3; -; SM90-NEXT: cvt.f32.bf16 %r11, %rs2; -; SM90-NEXT: cvt.f32.bf16 %r12, %rs1; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM90-NEXT: cvt.f32.bf16 %r5, %rs2; +; SM90-NEXT: cvt.f32.bf16 %r6, %rs1; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM90-NEXT: cvt.f32.bf16 %r7, %rs4; +; SM90-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM90-NEXT: cvt.f32.bf16 %r9, %rs6; +; SM90-NEXT: cvt.f32.bf16 %r10, %rs5; +; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM90-NEXT: cvt.f32.bf16 %r11, %rs8; +; SM90-NEXT: cvt.f32.bf16 %r12, %rs7; ; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index ba5813c869236..e2a914d8cfc36 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -359,12 +359,11 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x bfloat> ret <2 x bfloat> %r diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll index 09dbe91d07513..cf166f83fb241 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll @@ -24,8 +24,8 @@ define void 
@cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch) ; CHECK-PTX-NEXT: // %bb.0: ; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0]; ; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1]; -; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}]; ; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2; ; CHECK-PTX-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 %ch, i1 0) @@ -44,8 +44,8 @@ define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0 ; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1]; ; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2]; ; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3]; -; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}]; ; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2; ; CHECK-PTX-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0) @@ -66,8 +66,8 @@ define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2]; ; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3]; ; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4]; -; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile 
[%rd1, {%r1, %r2, %r3}]; ; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2; ; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1}; @@ -95,8 +95,8 @@ define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 ; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3]; ; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4]; ; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5]; -; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}]; ; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2; ; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6]; ; CHECK-PTX-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7]; @@ -126,8 +126,8 @@ define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 ; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4]; ; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5]; ; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6]; -; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}]; ; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, 
%r4, %r5}]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2; ; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7]; ; CHECK-PTX-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8]; diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll index 5998883f77ac1..3b5bd161896bc 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll @@ -27,8 +27,8 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap, ; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -41,8 +41,8 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap, ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 0) @@ -62,8 +62,8 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src, ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -77,8 +77,8 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src, ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2; ; 
CHECK-PTX-SHARED32-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0) @@ -99,8 +99,8 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; @@ -117,8 +117,8 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], 
%rd2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2; @@ -145,8 +145,8 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; @@ -164,8 +164,8 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, 
{%r2, %r3, %r4, %r5}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2; @@ -193,8 +193,8 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; @@ -213,8 +213,8 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, 
%r3, %r4, %r5, %r6}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 093bc20547b85..d0e2c1817f696 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -45,12 +45,11 @@ define <2 x half> @test_ret_const() #0 { define half @test_extract_0(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_0( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 0 @@ -60,13 +59,12 @@ define half @test_extract_0(<2 x half> %a) #0 { define half @test_extract_1(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_1( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } -; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; 
CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 1 ret half %e @@ -82,9 +80,8 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; ; CHECK-NEXT: setp.eq.b64 %p1, %rd1, 0; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; @@ -110,16 +107,14 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fadd_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NOF16-NEXT: add.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -148,8 +143,7 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: 
ld.param.b32 %r1, [test_fadd_imm_0_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_0_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -181,8 +175,7 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_1_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_1_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -214,16 +207,14 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fsub_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fsub_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsub_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsub_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: sub.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NOF16-NEXT: sub.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -251,8 +242,7 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 { ; 
CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fneg_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: mov.b32 %r3, 0f00000000; ; CHECK-NOF16-NEXT: sub.rn.f32 %r4, %r3, %r2; @@ -285,16 +275,14 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmul_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmul_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmul_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmul_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: mul.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NOF16-NEXT: mul.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -311,16 +299,14 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, 
[test_fdiv_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NEXT: div.rn.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -345,12 +331,10 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_frem_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_frem_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_frem_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_frem_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.f32 %r7, %r6; @@ -358,8 +342,8 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: testp.infinite.f32 %p1, %r3; ; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r9; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs1; ; CHECK-NEXT: div.rn.f32 %r12, %r11, %r10; ; CHECK-NEXT: cvt.rzi.f32.f32 %r13, %r12; ; CHECK-NEXT: neg.f32 %r14, %r13; @@ -551,13 +535,11 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r3, 
[test_select_cc_param_2]; -; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4; -; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-F16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-F16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; -; CHECK-F16-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; CHECK-F16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_1]; +; CHECK-F16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; +; CHECK-F16-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; ; CHECK-F16-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-F16-NEXT: ret; ; @@ -568,22 +550,18 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-NOF16-NEXT: .reg .b32 %r<9>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_3]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs5; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs6; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; -; CHECK-NOF16-NEXT: mov.b32 
{%rs5, %rs6}, %r2; -; CHECK-NOF16-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; CHECK-NOF16-NEXT: ret; %cc = fcmp une <2 x half> %c, %d @@ -596,15 +574,16 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .pred %p<3>; ; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b64 %rd<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; -; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; -; CHECK-F16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; -; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6; -; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2; -; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: selp.f32 %r7, %r4, %r6, %p2; +; CHECK-F16-NEXT: selp.f32 %r8, %r3, %r5, %p1; ; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-F16-NEXT: ret; ; @@ -613,22 +592,21 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .pred %p<3>; ; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<3>; ; 
CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; -; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4; -; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9; -; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2; -; CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: selp.f32 %r11, %r4, %r10, %p2; +; CHECK-NOF16-NEXT: selp.f32 %r12, %r3, %r9, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { @@ -643,18 +621,17 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .pred %p<3>; ; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, 
[test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f16_f32_param_0]; ; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; ; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; ; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; -; CHECK-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f16_f32_param_1]; +; CHECK-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; +; CHECK-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-NEXT: ret; <2 x float> %c, <2 x float> %d) #0 { @@ -687,15 +664,13 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_une_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_une_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_une_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_une_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; 
CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -730,15 +705,13 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ueq_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ueq_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ueq_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ueq_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.equ.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.equ.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -773,15 +746,13 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ugt_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ugt_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ugt_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ugt_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: 
cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -816,15 +787,13 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uge_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uge_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uge_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uge_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.geu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.geu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -859,15 +828,13 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ult_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ult_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ult_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, 
[test_fcmp_ult_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -902,15 +869,13 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ule_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ule_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ule_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ule_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.leu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.leu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -946,15 +911,13 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uno_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uno_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; 
CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uno_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uno_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -989,15 +952,13 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_one_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_one_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_one_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_one_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.ne.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.ne.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1032,15 +993,13 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: 
ld.param.b32 %r2, [test_fcmp_oeq_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oeq_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oeq_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oeq_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.eq.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1075,15 +1034,13 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ogt_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ogt_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ogt_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ogt_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.gt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], 
%rs5; @@ -1118,15 +1075,13 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oge_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oge_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oge_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oge_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.ge.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.ge.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1161,15 +1116,13 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_olt_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_olt_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_olt_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_olt_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: 
cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.lt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1204,15 +1157,13 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ole_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ole_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ole_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ole_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.le.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.le.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1247,15 +1198,13 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ord_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ord_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ord_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ord_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, 
%rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.num.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.num.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1273,8 +1222,7 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i32_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i32_param_0]; ; CHECK-NEXT: cvt.rzi.s32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.s32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1291,8 +1239,7 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i64_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i64_param_0]; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1308,8 +1255,7 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi32_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi32_param_0]; ; CHECK-NEXT: cvt.rzi.u32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.u32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1326,8 +1272,7 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, 
[test_fptoui_2xi64_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi64_param_0]; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1424,17 +1369,16 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1467,17 +1411,16 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; 
CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sitofp_2xi32_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1490,15 +1433,11 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2; -; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; -; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %r @@ -1529,8 +1468,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xfloat_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: 
cvt.f32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1547,8 +1485,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xdouble_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xdouble_param_0]; ; CHECK-NEXT: cvt.f64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.f64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1641,8 +1578,7 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_sqrt_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sqrt_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1670,8 +1606,7 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sin_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sin.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1692,8 +1627,7 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_cos_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cos.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1769,20 +1703,17 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: 
-; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fma_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fma_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fma_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -1809,8 +1740,7 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fabs_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: abs.f32 %r3, %r2; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1831,16 +1761,14 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_minnum_param_1]; -; 
CHECK-NEXT: ld.param.b32 %r1, [test_minnum_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_minnum_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_minnum_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: min.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NEXT: min.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1857,16 +1785,14 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_maxnum_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_maxnum_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_maxnum_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_maxnum_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: max.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NEXT: max.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1896,15 +1822,13 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, 
[test_copysign_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs5, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs1, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs4, 32767; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_param_1]; +; CHECK-NOF16-NEXT: and.b16 %rs5, %rs4, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs2, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs3, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs1, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs7}; ; CHECK-NOF16-NEXT: ret; @@ -1917,10 +1841,11 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; ; CHECK-F16-NEXT: .reg .b32 %r<8>; +; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; ; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; @@ -1934,19 +1859,19 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; ; CHECK-NOF16-NEXT: .reg .b32 %r<6>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f32_param_0]; ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, 
[test_copysign_f32_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; } -; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; } +; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; ; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; } -; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; } +; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5}; ; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x float> %b to <2 x half> @@ -1981,8 +1906,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f64_param_0]; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b64 %rd3, %rd2, -9223372036854775808; ; CHECK-NOF16-NEXT: shr.u64 %rd4, %rd3, 48; @@ -2024,15 +1948,13 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; 
CHECK-NOF16-NEXT: and.b16 %rs3, %rs1, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs4, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs2, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs5, 32767; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_extended_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_extended_param_1]; +; CHECK-NOF16-NEXT: and.b16 %rs5, %rs3, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs4, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs2, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7; @@ -2050,8 +1972,7 @@ define <2 x half> @test_floor(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_floor_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_floor_param_0]; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2067,8 +1988,7 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_ceil_param_0]; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2084,8 +2004,7 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 
{%rs1, %rs2}, [test_trunc_param_0]; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2101,8 +2020,7 @@ define <2 x half> @test_rint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_rint_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_rint_param_0]; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2118,8 +2036,7 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_nearbyint_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_nearbyint_param_0]; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2135,8 +2052,7 @@ define <2 x half> @test_roundeven(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_roundeven_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_roundeven_param_0]; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2154,8 +2070,7 @@ define <2 x half> @test_round(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<21>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_round_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; ; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; @@ -2206,20 
+2121,17 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmuladd_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmuladd_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fmuladd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -2236,8 +2148,7 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> @@ -2247,13 
+2158,12 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 { ; CHECK-LABEL: test_insertelement( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } +; CHECK-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %i = insertelement <2 x half> %a, half %x, i64 1 @@ -2267,8 +2177,7 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sitofp_2xi16_to_2xhalf_param_0]; ; CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2284,8 +2193,7 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_uitofp_2xi16_to_2xhalf_param_0]; ; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll new file mode 100644 index 0000000000000..af3cb63082e78 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -0,0 +1,1962 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; ## Full FP32x2 support enabled by default. +; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOF32X2 %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \ +; RUN: %} +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-F32X2 %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "nvptx64-nvidia-cuda" + +define <2 x float> @test_ret_const() #0 { +; CHECK-LABEL: test_ret_const( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {0f3F800000, 0f40000000}; +; CHECK-NEXT: ret; + ret <2 x float> +} + +define float @test_extract_0(<2 x float> %a) #0 { +; CHECK-LABEL: test_extract_0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %e = extractelement <2 x float> %a, i32 0 + ret float %e +} + +define float @test_extract_1(<2 x float> %a) #0 { +; CHECK-LABEL: test_extract_1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <2 x float> %a, i32 1 + ret float %e +} + +; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on +; 
test_extract_i_param_0 where the symbol's address is not taken first (that +; is, moved to a temporary) +; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 { +; %e = extractelement <2 x float> %a, i64 %idx +; ret float %e +; } + +define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fadd( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; 
CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> , %a + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, + ret <2 x float> %r +} + +define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_v4( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1]; +; 
CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r10, %r3, %r7; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r2, %r6; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_v4( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0_v4( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, 
%rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> , %a + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1_v4( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, + ret <4 x float> %r +} + +define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fsub( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; 
CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1]; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fsub( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; +; CHECK-F32X2-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fsub <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fneg(<2 x float> %a) #0 { +; CHECK-LABEL: test_fneg( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-NEXT: neg.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fneg <2 x float> %a + ret <2 x float> %r +} + +define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fmul( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1]; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fmul( +; CHECK-F32X2: 
{ +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fmul_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; +; CHECK-F32X2-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fmul <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fdiv( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; +; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: div.rn.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = fdiv <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_frem( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; +; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NEXT: neg.f32 %r7, %r6; +; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; +; CHECK-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-NEXT: div.rn.f32 %r10, %r1, %r3; +; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10; +; CHECK-NEXT: neg.f32 %r12, %r11; +; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; +; CHECK-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-NEXT: ret; + %r = frem <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x 
float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 
0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> , %a + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, + ret <2 x float> %r +} + +define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_v4_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7; +; CHECK-NOF32X2-NEXT: 
add.rn.ftz.f32 %r11, %r2, %r6; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_v4_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; 
+; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> , %a + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, + ret <4 x float> %r +} + +define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fsub_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; 
+; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1]; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fsub_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0]; +; CHECK-F32X2-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fsub <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { +; CHECK-LABEL: test_fneg_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-NEXT: neg.ftz.f32 %r3, %r2; +; CHECK-NEXT: neg.ftz.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fneg <2 x float> %a + ret <2 x float> %r +} + +define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fmul_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1]; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; 
+; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fmul_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; +; CHECK-F32X2-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fmul <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 { +; CHECK-NOF32X2-LABEL: test_fma_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2]; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fma_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; +; CHECK-F32X2-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-NEXT: ret; + %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_fdiv_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = fdiv <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_frem_ftz( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; +; CHECK-NEXT: neg.ftz.f32 %r7, %r6; +; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; +; CHECK-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; +; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; +; CHECK-NEXT: neg.ftz.f32 %r12, %r11; +; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; +; CHECK-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-NEXT: ret; + %r = frem <2 x float> %a, %b + ret <2 x float> %r +} + +define void @test_ldst_v2f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v2f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NEXT: ret; + %t1 = load <2 x float>, ptr %a + store <2 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v3f32(ptr %a, ptr %b) #0 { +; 
CHECK-LABEL: test_ldst_v3f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-NEXT: st.b32 [%rd2+8], %r1; +; CHECK-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NEXT: ret; + %t1 = load <3 x float>, ptr %a + store <3 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v4f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v4f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; +; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; +; CHECK-NEXT: ret; + %t1 = load <4 x float>, ptr %a + store <4 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v8f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16]; +; CHECK-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6}; +; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; +; CHECK-NEXT: ret; + %t1 = load <8 x float>, ptr %a + store <8 x float> %t1, ptr %b, align 32 + ret void +} + +declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0 + +define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_call( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0]; +; CHECK-NEXT: { 
// callseq 0, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd2; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_call_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_tailcall_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; 
CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 { +; CHECK-LABEL: test_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0]; +; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = select i1 %c, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 { +; CHECK-LABEL: test_select_cc( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<11>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3]; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1]; +; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p2; +; CHECK-NEXT: selp.f32 %r10, %r1, %r7, %p1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> 
%r +} + +define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 { +; CHECK-LABEL: test_select_cc_f64_f32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; +; CHECK-NEXT: setp.neu.f32 %p1, %r1, %r3; +; CHECK-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; +; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b + ret <2 x double> %r +} + +define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 { +; CHECK-LABEL: test_select_cc_f32_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; +; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; +; CHECK-NEXT: selp.f32 %r5, %r2, %r4, %p2; +; CHECK-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x double> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x 
float> %r +} + +define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_une( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; +; CHECK-NEXT: setp.neu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.neu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp une <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ueq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; +; CHECK-NEXT: setp.equ.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.equ.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ueq <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ugt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; +; CHECK-NEXT: setp.gtu.f32 
%p1, %r2, %r4; +; CHECK-NEXT: setp.gtu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ugt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_uge( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; +; CHECK-NEXT: setp.geu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.geu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp uge <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ult( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; +; CHECK-NEXT: setp.ltu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.ltu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ult <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ule( +; CHECK: { +; CHECK-NEXT: .reg .pred 
%p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; +; CHECK-NEXT: setp.leu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.leu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ule <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_uno( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; +; CHECK-NEXT: setp.nan.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.nan.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp uno <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_one( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; +; CHECK-NEXT: setp.ne.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.ne.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; 
CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp one <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_oeq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; +; CHECK-NEXT: setp.eq.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.eq.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp oeq <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ogt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; +; CHECK-NEXT: setp.gt.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.gt.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ogt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_oge( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; +; CHECK-NEXT: setp.ge.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.ge.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp oge <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_olt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; +; CHECK-NEXT: setp.lt.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.lt.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp olt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ole( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; +; CHECK-NEXT: setp.le.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.le.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ole <2 x float> %a, %b + ret <2 x i1> %r +} + 
+define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ord( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; +; CHECK-NEXT: setp.num.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.num.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ord <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptosi_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fptosi <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptosi_i64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fptosi <2 x float> %a to <2 x i64> + ret <2 x i64> %r +} + +define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fptoui <2 x float> %a to <2 x i64> + ret <2 x i64> %r +} + +define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f32.u64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.u64 %r2, %rd1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %r = uitofp <2 x i64> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f32.s64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.s64 %r2, %rd1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %r = sitofp <2 x i64> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_fadd( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_uitofp_2xi32_fadd( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; 
CHECK-F32X2-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %c = uitofp <2 x i32> %a to <2 x float> + %r = fadd <2 x float> %b, %c + ret <2 x float> %r +} + +define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fptrunc <2 x double> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fpext <2 x float> %a to <2 x double> + ret <2 x double> %r +} + +define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NEXT: ret; + %r = bitcast <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 { +; CHECK-LABEL: test_bitcast_double_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_double_to_2xfloat_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast double %a to <2 x float> + ret <2 x float> %r +} + +define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcast_2xfloat_to_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast <2 x float> %a to double + ret double %r +} + +define <2 x float> @test_sqrt(<2 x float> %a) #0 { +; CHECK-LABEL: test_sqrt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.sqrt(<2 x float> %a) + ret <2 x float> %r +} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_powi( +;define <2 x float> @test_powi(<2 x float> %a, <2 x i32> %b) #0 { +; %r = call <2 x float> @llvm.powi.i32(<2 x float> %a, <2 x i32> %b) +; ret <2 x float> %r +;} + +define <2 x float> @test_sin(<2 x float> %a) #0 #1 { +; CHECK-LABEL: test_sin( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.sin(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_cos(<2 x float> %a) #0 #1 { +; CHECK-LABEL: test_cos( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.cos(<2 x float> %a) + ret <2 x float> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x float> @test_pow(<2 x float> %a, <2 x float> %b) #0 { +; %r = call <2 x float> @llvm.pow(<2 x float> %a, <2 x float> %b) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define <2 x float> @test_exp(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.exp(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x float> @test_exp2(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.exp2(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_log( +;define <2 x float> @test_log(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x float> @test_log10(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log10(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log2( +;define <2 x float> @test_log2(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log2(<2 x float> %a) +; ret <2 x float> %r +;} + + +define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +; CHECK-NOF32X2-LABEL: test_fma( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fma( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; +; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-NEXT: ret; + %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_fabs(<2 x float> %a) #0 { +; CHECK-LABEL: test_fabs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: abs.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.fabs(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_minnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; +; CHECK-NEXT: min.f32 %r5, %r2, %r4; +; CHECK-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_maxnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; +; CHECK-NEXT: max.f32 %r5, %r2, %r4; +; CHECK-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; +; CHECK-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-NEXT: st.param.v2.b32 
[func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r3; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; +; CHECK-NEXT: and.b64 %rd5, %rd4, 1; +; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-NEXT: abs.f32 %r6, %r1; +; CHECK-NEXT: neg.f32 %r7, %r6; +; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; +; CHECK-NEXT: and.b64 %rd7, %rd6, 1; +; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-NEXT: ret; + %tb = fptrunc <2 x double> %b to <2 x float> + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) + ret <2 x float> %r +} + +define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_extended( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; +; CHECK-NEXT: cvt.f64.f32 %rd4, %r5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) + %xr = fpext <2 x float> %r to <2 x double> + ret <2 x 
double> %xr +} + +define <2 x float> @test_floor(<2 x float> %a) #0 { +; CHECK-LABEL: test_floor( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.floor(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_ceil(<2 x float> %a) #0 { +; CHECK-LABEL: test_ceil( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.ceil(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_trunc(<2 x float> %a) #0 { +; CHECK-LABEL: test_trunc( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.trunc(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_rint(<2 x float> %a) #0 { +; CHECK-LABEL: test_rint( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.rint(<2 x float> %a) + 
ret <2 x float> %r +} + +define <2 x float> @test_nearbyint(<2 x float> %a) #0 { +; CHECK-LABEL: test_nearbyint( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_roundeven(<2 x float> %a) #0 { +; CHECK-LABEL: test_roundeven( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.roundeven(<2 x float> %a) + ret <2 x float> %r +} + +; check the use of sign mask and 0.5 to implement round +define <2 x float> @test_round(<2 x float> %a) #0 { +; CHECK-LABEL: test_round( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NEXT: abs.f32 %r7, %r2; +; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-NEXT: add.rn.f32 %r13, %r1, %r12; +; 
CHECK-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-NEXT: abs.f32 %r15, %r1; +; CHECK-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.round(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +; CHECK-NOF32X2-LABEL: test_fmuladd( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fmuladd( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; +; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-NEXT: ret; + %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_shufflevector(<2 x float> %a) #0 { +; CHECK-LABEL: test_shufflevector( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> + ret <2 x float> %s +} + +define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { +; CHECK-LABEL: test_insertelement( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_insertelement_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %i = insertelement <2 x float> %a, float %x, i64 1 + ret <2 x float> %i +} + +define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } +attributes #2 = { "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll 
b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll new file mode 100644 index 0000000000000..dc0ec0ff7bb0b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} + +target triple = "nvptx64-unknown-cuda" + +;; FAST-LABEL: @t0 +;; DEFAULT-LABEL: @t0 +define <2 x float> @t0(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; FAST-LABEL: t0( +; FAST: { +; FAST-NEXT: .reg .b64 %rd<5>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.b64 %rd1, [t0_param_0]; +; FAST-NEXT: ld.param.b64 %rd2, [t0_param_1]; +; FAST-NEXT: ld.param.b64 %rd3, [t0_param_2]; +; FAST-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; FAST-NEXT: st.param.b64 [func_retval0], %rd4; +; FAST-NEXT: ret; +; +; DEFAULT-LABEL: t0( +; DEFAULT: { +; DEFAULT-NEXT: .reg .b64 %rd<6>; +; DEFAULT-EMPTY: +; DEFAULT-NEXT: // %bb.0: +; DEFAULT-NEXT: ld.param.b64 %rd1, [t0_param_0]; +; DEFAULT-NEXT: ld.param.b64 %rd2, [t0_param_1]; +; DEFAULT-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; +; DEFAULT-NEXT: ld.param.b64 %rd4, [t0_param_2]; +; DEFAULT-NEXT: add.rn.f32x2 %rd5, %rd3, %rd4; +; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; +; DEFAULT-NEXT: ret; + %v0 = fmul <2 x float> %a, %b + %v1 = fadd <2 x float> %v0, %c + ret <2 x float> %v1 +} + +;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32x2 +;; to prevent ptxas from fusing this with anything else. 
+define <2 x float> @t1(<2 x float> %a, <2 x float> %b) { +; FAST-LABEL: t1( +; FAST: { +; FAST-NEXT: .reg .b64 %rd<6>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.b64 %rd1, [t1_param_0]; +; FAST-NEXT: ld.param.b64 %rd2, [t1_param_1]; +; FAST-NEXT: add.f32x2 %rd3, %rd1, %rd2; +; FAST-NEXT: sub.f32x2 %rd4, %rd1, %rd2; +; FAST-NEXT: mul.f32x2 %rd5, %rd3, %rd4; +; FAST-NEXT: st.param.b64 [func_retval0], %rd5; +; FAST-NEXT: ret; +; +; DEFAULT-LABEL: t1( +; DEFAULT: { +; DEFAULT-NEXT: .reg .b64 %rd<6>; +; DEFAULT-EMPTY: +; DEFAULT-NEXT: // %bb.0: +; DEFAULT-NEXT: ld.param.b64 %rd1, [t1_param_0]; +; DEFAULT-NEXT: ld.param.b64 %rd2, [t1_param_1]; +; DEFAULT-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; DEFAULT-NEXT: sub.rn.f32x2 %rd4, %rd1, %rd2; +; DEFAULT-NEXT: mul.rn.f32x2 %rd5, %rd3, %rd4; +; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; +; DEFAULT-NEXT: ret; + %v1 = fadd <2 x float> %a, %b + %v2 = fsub <2 x float> %a, %b + %v3 = fmul <2 x float> %v1, %v2 + ret <2 x float> %v3 +} + +;; Make sure we generate the non ".rn" version when the "contract" flag is +;; present on the instructions +define <2 x float> @t2(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: t2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [t2_param_1]; +; CHECK-NEXT: add.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: sub.f32x2 %rd4, %rd1, %rd2; +; CHECK-NEXT: mul.f32x2 %rd5, %rd3, %rd4; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; + %v1 = fadd contract <2 x float> %a, %b + %v2 = fsub contract <2 x float> %a, %b + %v3 = fmul contract <2 x float> %v1, %v2 + ret <2 x float> %v3 +} + +;; Make sure we always fold to fma when the "contract" flag is present +define <2 x float> @t3(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; CHECK-LABEL: t3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b64 %rd1, [t3_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [t3_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [t3_param_2]; +; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %v0 = fmul contract <2 x float> %a, %b + %v1 = fadd contract <2 x float> %v0, %c + ret <2 x float> %v1 +} diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 2b7a06c33d948..1a61498b10142 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -32,57 +32,31 @@ define <2 x i16> @test_ret_const() #0 { } define i16 @test_extract_0(<2 x i16> %a) #0 { -; I16x2-LABEL: test_extract_0( -; I16x2: { -; I16x2-NEXT: .reg .b16 %rs<2>; -; I16x2-NEXT: .reg .b32 %r<3>; -; I16x2-EMPTY: -; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; I16x2-NEXT: mov.b32 {%rs1, _}, %r1; -; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; I16x2-NEXT: ret; -; -; NO-I16x2-LABEL: test_extract_0( -; NO-I16x2: { -; NO-I16x2-NEXT: .reg .b16 %rs<2>; -; NO-I16x2-NEXT: .reg .b32 %r<3>; -; NO-I16x2-EMPTY: -; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } -; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; NO-I16x2-NEXT: ret; +; COMMON-LABEL: test_extract_0( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; +; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x i16> %a, i32 0 ret i16 %e } define i16 @test_extract_1(<2 x i16> %a) #0 { -; I16x2-LABEL: test_extract_1( -; I16x2: { -; I16x2-NEXT: .reg .b16 
%rs<2>; -; I16x2-NEXT: .reg .b32 %r<3>; -; I16x2-EMPTY: -; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; -; I16x2-NEXT: mov.b32 {_, %rs1}, %r1; -; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; I16x2-NEXT: ret; -; -; NO-I16x2-LABEL: test_extract_1( -; NO-I16x2: { -; NO-I16x2-NEXT: .reg .b16 %rs<2>; -; NO-I16x2-NEXT: .reg .b32 %r<3>; -; NO-I16x2-EMPTY: -; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; -; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } -; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; NO-I16x2-NEXT: ret; +; COMMON-LABEL: test_extract_1( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; +; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x i16> %a, i32 1 ret i16 %e } @@ -97,9 +71,8 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; ; COMMON-NEXT: setp.eq.b64 %p1, %rd1, 0; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs3; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; @@ -126,12 +99,10 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: 
add.s16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: add.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_add_param_1]; +; NO-I16x2-NEXT: add.s16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: add.s16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %r = add <2 x i16> %a, %b @@ -157,8 +128,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_0_param_0]; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -185,8 +155,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_1_param_0]; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -202,12 +171,10 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r2, [test_sub_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_sub_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2; -; COMMON-NEXT: sub.s16 %rs6, %rs3, %rs1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sub_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sub_param_1]; +; COMMON-NEXT: sub.s16 %rs5, %rs2, %rs4; +; COMMON-NEXT: sub.s16 %rs6, %rs1, 
%rs3; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = sub <2 x i16> %a, %b @@ -232,12 +199,10 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: max.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smax_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smax_param_1]; +; NO-I16x2-NEXT: max.s16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: max.s16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sgt <2 x i16> %a, %b @@ -263,12 +228,10 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: max.u16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umax_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umax_param_1]; +; NO-I16x2-NEXT: max.u16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: max.u16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ugt <2 x i16> %a, %b @@ -294,12 +257,10 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0]; -; 
NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: min.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smin_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smin_param_1]; +; NO-I16x2-NEXT: min.s16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: min.s16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sle <2 x i16> %a, %b @@ -325,12 +286,10 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: min.u16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umin_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umin_param_1]; +; NO-I16x2-NEXT: min.u16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: min.u16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ule <2 x i16> %a, %b @@ -345,12 +304,10 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r2, [test_mul_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_mul_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; -; COMMON-NEXT: mul.lo.s16 %rs6, %rs3, %rs1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_mul_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_mul_param_1]; +; COMMON-NEXT: mul.lo.s16 %rs5, %rs2, %rs4; +; COMMON-NEXT: mul.lo.s16 %rs6, %rs1, %rs3; ; COMMON-NEXT: 
st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = mul <2 x i16> %a, %b @@ -729,18 +686,14 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; -; COMMON-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs1; -; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs2; -; COMMON-NEXT: mov.b32 {%rs5, %rs6}, %r2; -; COMMON-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; COMMON-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; -; COMMON-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_2]; +; COMMON-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_3]; +; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs5; +; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs6; +; COMMON-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; +; COMMON-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; +; COMMON-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; COMMON-NEXT: ret; %cc = icmp ne <2 x i16> %c, %d @@ -758,12 +711,10 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; ; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; -; COMMON-NEXT: ld.param.b32 %r6, [test_select_cc_i32_i16_param_3]; -; COMMON-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i16_param_2]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5; -; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs1; -; 
COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs2; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i32_i16_param_2]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i32_i16_param_3]; +; COMMON-NEXT: setp.ne.b16 %p1, %rs1, %rs3; +; COMMON-NEXT: setp.ne.b16 %p2, %rs2, %rs4; ; COMMON-NEXT: selp.b32 %r7, %r2, %r4, %p2; ; COMMON-NEXT: selp.b32 %r8, %r1, %r3, %p1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; @@ -784,14 +735,12 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; -; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_i16_i32_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_i16_i32_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i16_i32_param_0]; ; COMMON-NEXT: setp.ne.b32 %p1, %r3, %r5; ; COMMON-NEXT: setp.ne.b32 %p2, %r4, %r6; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; COMMON-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; -; COMMON-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i16_i32_param_1]; +; COMMON-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; +; COMMON-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; <2 x i32> %c, <2 x i32> %d) #0 { @@ -902,8 +851,7 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi32_param_0]; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; ; COMMON-NEXT: cvt.u32.u16 %r3, %rs1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -920,8 +868,7 @@ define <2 x i64> @test_zext_2xi64(<2 x 
i16> %a) #0 { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi64_param_0]; ; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2; ; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1; ; COMMON-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -979,8 +926,7 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; COMMON-NEXT: ret; %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> @@ -988,29 +934,16 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { } define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { -; I16x2-LABEL: test_insertelement( -; I16x2: { -; I16x2-NEXT: .reg .b16 %rs<3>; -; I16x2-NEXT: .reg .b32 %r<2>; -; I16x2-EMPTY: -; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; I16x2-NEXT: mov.b32 {%rs2, _}, %r1; -; I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; -; I16x2-NEXT: ret; -; -; NO-I16x2-LABEL: test_insertelement( -; NO-I16x2: { -; NO-I16x2-NEXT: .reg .b16 %rs<3>; -; NO-I16x2-NEXT: .reg .b32 %r<2>; -; NO-I16x2-EMPTY: -; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } -; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; -; NO-I16x2-NEXT: ret; +; COMMON-LABEL: test_insertelement( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<4>; +; COMMON-NEXT: .reg .b32 %r<2>; +; 
COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; COMMON-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; +; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; COMMON-NEXT: ret; %i = insertelement <2 x i16> %a, i16 %x, i64 1 ret <2 x i16> %i } @@ -1022,8 +955,7 @@ define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_2xhalf_to_2xi16_param_0]; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -1039,8 +971,7 @@ define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xhalf_to_2xi16_param_0]; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 328da60a1f783..1fc42d6cc02c0 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1240,18 +1240,16 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; -; CHECK-NEXT: mov.b32 %r3, {%rs4, 
%rs3}; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs5; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; ; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; @@ -1271,18 +1269,16 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; -; CHECK-NEXT: mov.b32 %r3, {%rs4, %rs3}; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs5; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; ; CHECK-NEXT: 
mov.b32 %r7, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index 9e9705709f2bd..efa2666090ccc 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -7,17 +7,16 @@ declare <4 x float> @bar() define void @foo(ptr %ptr) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; ; CHECK-NEXT: call.uni (retval0), bar, (); -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; ; CHECK-NEXT: ret; %val = tail call <4 x float> @bar() store <4 x float> %val, ptr %ptr diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index a9bd3c1caebe5..187ccc9cd89f7 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -110,11 +110,11 @@ define void @avar_i64() { define void @avar_float() { ; PTX-LABEL: avar_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin, 
!invariant.load !0 store <8 x float> %load, ptr addrspace(1) @globalout @@ -234,11 +234,11 @@ define void @asi_i64() { define void @asi_float() { ; PTX-LABEL: asi_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -364,14 +364,13 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0 store <8 x float> %load, ptr addrspace(1) %out @@ -510,14 +509,13 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: 
; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index 45e17016d8ee8..a17df1ee39883 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -106,11 +106,11 @@ define void @avar_i64() { define void @avar_float() { ; PTX-LABEL: avar_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin store <8 x float> %load, ptr addrspace(1) @globalout @@ -230,11 +230,11 @@ define void @asi_i64() { define void @asi_float() { ; PTX-LABEL: asi_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 
[globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset @@ -360,14 +360,13 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) %in store <8 x float> %load, ptr addrspace(1) %out @@ -506,14 +505,13 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset diff --git 
a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index dfbc2c34b15d4..68c53cde7f9ac 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -139,16 +139,15 @@ define void @generic_4xi64(ptr %a, ptr %b) { define void @generic_8xfloat(ptr %a, ptr %b) { ; CHECK-LABEL: generic_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; -; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1]; -; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr %a store <8 x float> %a.load, ptr %b @@ -291,16 +290,15 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) { define void @generic_volatile_8xfloat(ptr %a, ptr %b) { ; CHECK-LABEL: generic_volatile_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.volatile.v2.b64 
{%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr %a store volatile <8 x float> %a.load, ptr %b @@ -516,28 +514,26 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b32 %r<9>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %rd<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; -; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; -; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_8xfloat( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; -; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; -; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], 
{%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(1) %a store <8 x float> %a.load, ptr addrspace(1) %b @@ -762,28 +758,26 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_volatile_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b32 %r<9>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %rd<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; -; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; -; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_8xfloat( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; -; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; -; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; 
SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(1) %a store volatile <8 x float> %a.load, ptr addrspace(1) %b @@ -939,16 +933,15 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-LABEL: shared_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; -; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1]; -; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1]; +; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(3) %a store <8 x float> %a.load, ptr addrspace(3) %b @@ -1091,16 +1084,15 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-LABEL: shared_volatile_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: 
st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(3) %a store volatile <8 x float> %a.load, ptr addrspace(3) %b @@ -1245,16 +1237,15 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-LABEL: local_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(5) %a store <8 x float> %a.load, ptr addrspace(5) %b @@ -1397,16 +1388,15 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-LABEL: local_volatile_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.b64 %rd1, [local_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(5) %a store volatile <8 x float> %a.load, ptr addrspace(5) %b diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll index b6a00e03a80ab..ec8dd0c5c9350 100644 --- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -333,30 +333,28 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) { define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) { ; SM20-LABEL: foo11( ; SM20: { -; SM20-NEXT: .reg .b32 %r<3>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b64 %rd<6>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0]; ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1]; ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2]; -; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; +; SM20-NEXT: ld.global.b64 %rd5, [%rd2]; +; SM20-NEXT: st.global.b64 [%rd4], %rd5; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo11( ; SM35: { -; SM35-NEXT: .reg .b32 %r<3>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b64 %rd<6>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.b64 %rd1, 
[foo11_param_0]; ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1]; ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2]; -; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; +; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2]; +; SM35-NEXT: st.global.b64 [%rd4], %rd5; ; SM35-NEXT: ret; %1 = load <2 x float>, ptr %from store <2 x float> %1, ptr %to @@ -496,30 +494,28 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) { define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) { ; SM20-LABEL: foo16( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b64 %rd<7>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0]; ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1]; ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; +; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; +; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo16( ; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b64 %rd<7>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0]; ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1]; ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; +; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; +; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; ; SM35-NEXT: ret; %1 = load <4 x float>, ptr %from store <4 x float> %1, ptr %to diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index db8733da5b7e4..dfdb33852305b 
100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -1,131 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK-LABEL: t1 define <4 x float> @t1(ptr %p1) { -; CHECK-NOT: ld.v4 -; CHECK-NOT: ld.v2 -; CHECK-NOT: ld.b32 -; CHECK: ld.b8 +; CHECK-LABEL: t1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<46>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0]; +; CHECK-NEXT: ld.b8 %rd2, [%rd1+8]; +; CHECK-NEXT: ld.b8 %rd3, [%rd1+9]; +; CHECK-NEXT: shl.b64 %rd4, %rd3, 8; +; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; +; CHECK-NEXT: ld.b8 %rd6, [%rd1+10]; +; CHECK-NEXT: shl.b64 %rd7, %rd6, 16; +; CHECK-NEXT: ld.b8 %rd8, [%rd1+11]; +; CHECK-NEXT: shl.b64 %rd9, %rd8, 24; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7; +; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5; +; CHECK-NEXT: ld.b8 %rd12, [%rd1+12]; +; CHECK-NEXT: ld.b8 %rd13, [%rd1+13]; +; CHECK-NEXT: shl.b64 %rd14, %rd13, 8; +; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12; +; CHECK-NEXT: ld.b8 %rd16, [%rd1+14]; +; CHECK-NEXT: shl.b64 %rd17, %rd16, 16; +; CHECK-NEXT: ld.b8 %rd18, [%rd1+15]; +; CHECK-NEXT: shl.b64 %rd19, %rd18, 24; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15; +; CHECK-NEXT: shl.b64 %rd22, %rd21, 32; +; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11; +; CHECK-NEXT: ld.b8 %rd24, [%rd1]; +; CHECK-NEXT: ld.b8 %rd25, [%rd1+1]; +; CHECK-NEXT: shl.b64 %rd26, %rd25, 8; +; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24; +; CHECK-NEXT: ld.b8 %rd28, [%rd1+2]; +; CHECK-NEXT: shl.b64 %rd29, %rd28, 16; +; CHECK-NEXT: ld.b8 %rd30, 
[%rd1+3]; +; CHECK-NEXT: shl.b64 %rd31, %rd30, 24; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29; +; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27; +; CHECK-NEXT: ld.b8 %rd34, [%rd1+4]; +; CHECK-NEXT: ld.b8 %rd35, [%rd1+5]; +; CHECK-NEXT: shl.b64 %rd36, %rd35, 8; +; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34; +; CHECK-NEXT: ld.b8 %rd38, [%rd1+6]; +; CHECK-NEXT: shl.b64 %rd39, %rd38, 16; +; CHECK-NEXT: ld.b8 %rd40, [%rd1+7]; +; CHECK-NEXT: shl.b64 %rd41, %rd40, 24; +; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39; +; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37; +; CHECK-NEXT: shl.b64 %rd44, %rd43, 32; +; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 1 ret <4 x float> %r } -; CHECK-LABEL: t2 define <4 x float> @t2(ptr %p1) { -; CHECK-NOT: ld.v4 -; CHECK-NOT: ld.v2 -; CHECK: ld.b32 +; CHECK-LABEL: t2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; +; CHECK-NEXT: ld.b32 %rd2, [%rd1+8]; +; CHECK-NEXT: ld.b32 %rd3, [%rd1+12]; +; CHECK-NEXT: shl.b64 %rd4, %rd3, 32; +; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; +; CHECK-NEXT: ld.b32 %rd6, [%rd1]; +; CHECK-NEXT: ld.b32 %rd7, [%rd1+4]; +; CHECK-NEXT: shl.b64 %rd8, %rd7, 32; +; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 4 ret <4 x float> %r } -; CHECK-LABEL: t3 define <4 x float> @t3(ptr %p1) { -; CHECK-NOT: ld.v4 -; CHECK: ld.v2 +; CHECK-LABEL: t3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0]; +; CHECK-NEXT: ld.b64 %rd2, [%rd1+8]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 8 ret <4 x float> %r } -; CHECK-LABEL: t4 define <4 x float> @t4(ptr %p1) { -; 
CHECK: ld.v4 +; CHECK-LABEL: t4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 16 ret <4 x float> %r } -; CHECK-LABEL: .visible .func test_v1halfp0a1( -; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0]; -; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1]; -; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] -; CHECK: ret define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) { +; CHECK-LABEL: test_v1halfp0a1( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v1halfp0a1_param_0]; +; CHECK-NEXT: ld.b8 %rs1, [%rd1]; +; CHECK-NEXT: ld.b8 %rs2, [%rd1+1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_v1halfp0a1_param_1]; +; CHECK-NEXT: st.b8 [%rd2+1], %rs2; +; CHECK-NEXT: st.b8 [%rd2], %rs1; +; CHECK-NEXT: ret; %1 = load <1 x half>, ptr %from , align 1 store <1 x half> %1, ptr %to , align 1 ret void } -; CHECK-LABEL: .visible .func test_v2halfp0a1( -; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0]; -; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1]; -; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.b8 [%[[TO]]], -; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.b8 [%[[TO]]+1], -; CHECK-DAG: ld.b8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] -; CHECK-DAG: st.b8 [%[[TO]]+2], -; CHECK-DAG: ld.b8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] -; CHECK-DAG: st.b8 [%[[TO]]+3], -; CHECK: ret define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) { +; CHECK-LABEL: 
test_v2halfp0a1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2halfp0a1_param_0]; +; CHECK-NEXT: ld.b8 %r1, [%rd1+1]; +; CHECK-NEXT: ld.b8 %r2, [%rd1]; +; CHECK-NEXT: ld.b8 %r3, [%rd1+3]; +; CHECK-NEXT: ld.b8 %r4, [%rd1+2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_v2halfp0a1_param_1]; +; CHECK-NEXT: st.b8 [%rd2+2], %r4; +; CHECK-NEXT: st.b8 [%rd2+3], %r3; +; CHECK-NEXT: st.b8 [%rd2], %r2; +; CHECK-NEXT: st.b8 [%rd2+1], %r1; +; CHECK-NEXT: ret; %1 = load <2 x half>, ptr %from , align 1 store <2 x half> %1, ptr %to , align 1 ret void } -; CHECK-LABEL: .visible .func test_v4halfp0a1( -; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0]; -; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1]; -; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] -; CHECK-DAG: ld.b8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] -; CHECK-DAG: st.b8 [%[[TO]]+2], [[B2]] -; CHECK-DAG: ld.b8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] -; CHECK-DAG: st.b8 [%[[TO]]+3], [[B3]] -; CHECK-DAG: ld.b8 [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4] -; CHECK-DAG: st.b8 [%[[TO]]+4], [[B4]] -; CHECK-DAG: ld.b8 [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5] -; CHECK-DAG: st.b8 [%[[TO]]+5], [[B5]] -; CHECK-DAG: ld.b8 [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6] -; CHECK-DAG: st.b8 [%[[TO]]+6], [[B6]] -; CHECK-DAG: ld.b8 [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7] -; CHECK-DAG: st.b8 [%[[TO]]+7], [[B7]] -; CHECK: ret define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { +; CHECK-LABEL: test_v4halfp0a1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v4halfp0a1_param_0]; +; CHECK-NEXT: ld.b8 %r1, [%rd1+1]; +; CHECK-NEXT: ld.b8 %r2, [%rd1]; +; CHECK-NEXT: ld.b8 
%r3, [%rd1+3]; +; CHECK-NEXT: ld.b8 %r4, [%rd1+2]; +; CHECK-NEXT: ld.b8 %r5, [%rd1+5]; +; CHECK-NEXT: ld.b8 %r6, [%rd1+4]; +; CHECK-NEXT: ld.b8 %r7, [%rd1+7]; +; CHECK-NEXT: ld.b8 %r8, [%rd1+6]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_v4halfp0a1_param_1]; +; CHECK-NEXT: st.b8 [%rd2+6], %r8; +; CHECK-NEXT: st.b8 [%rd2+7], %r7; +; CHECK-NEXT: st.b8 [%rd2+4], %r6; +; CHECK-NEXT: st.b8 [%rd2+5], %r5; +; CHECK-NEXT: st.b8 [%rd2+2], %r4; +; CHECK-NEXT: st.b8 [%rd2+3], %r3; +; CHECK-NEXT: st.b8 [%rd2], %r2; +; CHECK-NEXT: st.b8 [%rd2+1], %r1; +; CHECK-NEXT: ret; %1 = load <4 x half>, ptr %from , align 1 store <4 x half> %1, ptr %to , align 1 ret void } -; CHECK-LABEL: s1 define void @s1(ptr %p1, <4 x float> %v) { -; CHECK-NOT: st.v4 -; CHECK-NOT: st.v2 -; CHECK-NOT: st.b32 -; CHECK: st.b8 +; CHECK-LABEL: s1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<18>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1]; +; CHECK-NEXT: st.b8 [%rd1+8], %rd3; +; CHECK-NEXT: st.b8 [%rd1], %rd2; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 56; +; CHECK-NEXT: st.b8 [%rd1+15], %rd4; +; CHECK-NEXT: shr.u64 %rd5, %rd3, 48; +; CHECK-NEXT: st.b8 [%rd1+14], %rd5; +; CHECK-NEXT: shr.u64 %rd6, %rd3, 40; +; CHECK-NEXT: st.b8 [%rd1+13], %rd6; +; CHECK-NEXT: shr.u64 %rd7, %rd3, 32; +; CHECK-NEXT: st.b8 [%rd1+12], %rd7; +; CHECK-NEXT: shr.u64 %rd8, %rd3, 24; +; CHECK-NEXT: st.b8 [%rd1+11], %rd8; +; CHECK-NEXT: shr.u64 %rd9, %rd3, 16; +; CHECK-NEXT: st.b8 [%rd1+10], %rd9; +; CHECK-NEXT: shr.u64 %rd10, %rd3, 8; +; CHECK-NEXT: st.b8 [%rd1+9], %rd10; +; CHECK-NEXT: shr.u64 %rd11, %rd2, 56; +; CHECK-NEXT: st.b8 [%rd1+7], %rd11; +; CHECK-NEXT: shr.u64 %rd12, %rd2, 48; +; CHECK-NEXT: st.b8 [%rd1+6], %rd12; +; CHECK-NEXT: shr.u64 %rd13, %rd2, 40; +; CHECK-NEXT: st.b8 [%rd1+5], %rd13; +; CHECK-NEXT: shr.u64 %rd14, %rd2, 32; +; CHECK-NEXT: st.b8 [%rd1+4], %rd14; +; CHECK-NEXT: shr.u64 %rd15, %rd2, 24; +; CHECK-NEXT: st.b8 
[%rd1+3], %rd15; +; CHECK-NEXT: shr.u64 %rd16, %rd2, 16; +; CHECK-NEXT: st.b8 [%rd1+2], %rd16; +; CHECK-NEXT: shr.u64 %rd17, %rd2, 8; +; CHECK-NEXT: st.b8 [%rd1+1], %rd17; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 1 ret void } -; CHECK-LABEL: s2 define void @s2(ptr %p1, <4 x float> %v) { -; CHECK-NOT: st.v4 -; CHECK-NOT: st.v2 -; CHECK: st.b32 +; CHECK-LABEL: s2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1]; +; CHECK-NEXT: st.b32 [%rd1+8], %rd3; +; CHECK-NEXT: st.b32 [%rd1], %rd2; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 32; +; CHECK-NEXT: st.b32 [%rd1+12], %rd4; +; CHECK-NEXT: shr.u64 %rd5, %rd2, 32; +; CHECK-NEXT: st.b32 [%rd1+4], %rd5; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 4 ret void } -; CHECK-LABEL: s3 define void @s3(ptr %p1, <4 x float> %v) { -; CHECK-NOT: st.v4 +; CHECK-LABEL: s3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1]; +; CHECK-NEXT: st.b64 [%rd1+8], %rd3; +; CHECK-NEXT: st.b64 [%rd1], %rd2; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 8 ret void } -; CHECK-LABEL: s4 define void @s4(ptr %p1, <4 x float> %v) { -; CHECK: st.v4 +; CHECK-LABEL: s4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 16 ret void } diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll index aa463b510fe84..c78fcddb7ed0f 100644 --- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll +++ 
b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} @@ -8,23 +9,41 @@ @gv_float2 = external constant <2 x float> @gv_float4 = external constant <4 x float> -; CHECK-LABEL: test_gv_float() define float @test_gv_float() { -; CHECK: ld.global.nc.b32 +; CHECK-LABEL: test_gv_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.nc.b32 %r1, [gv_float]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %v = load float, ptr @gv_float ret float %v } -; CHECK-LABEL: test_gv_float2() define <2 x float> @test_gv_float2() { -; CHECK: ld.global.nc.v2.b32 +; CHECK-LABEL: test_gv_float2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.nc.b64 %rd1, [gv_float2]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; %v = load <2 x float>, ptr @gv_float2 ret <2 x float> %v } -; CHECK-LABEL: test_gv_float4() define <4 x float> @test_gv_float4() { -; CHECK: ld.global.nc.v4.b32 +; CHECK-LABEL: test_gv_float4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.nc.v2.b64 {%rd1, %rd2}, [gv_float4]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; %v = load <4 x float>, ptr @gv_float4 ret <4 x float> %v } diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index e10949f95fac4..87f965c84b6b6 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -114,18 +114,19 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fadd_float( 
; CHECK: { ; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r1, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r2; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r3; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r5; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r6; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r7; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r8; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0]; +; CHECK-NEXT: add.rn.f32 %r9, %r5, 0f00000000; +; CHECK-NEXT: add.rn.f32 %r10, %r9, %r6; +; CHECK-NEXT: add.rn.f32 %r11, %r10, %r7; +; CHECK-NEXT: add.rn.f32 %r12, %r11, %r8; +; CHECK-NEXT: add.rn.f32 %r13, %r12, %r1; +; CHECK-NEXT: add.rn.f32 %r14, %r13, %r2; +; CHECK-NEXT: add.rn.f32 %r15, %r14, %r3; +; CHECK-NEXT: add.rn.f32 %r16, %r15, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) @@ -133,45 +134,89 @@ define float @reduce_fadd_float(<8 x float> %in) { } define float @reduce_fadd_float_reassoc(<8 x float> %in) { -; CHECK-LABEL: reduce_fadd_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<17>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: add.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: add.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: 
add.rn.f32 %r15, %r14, %r13; -; CHECK-NEXT: add.rn.f32 %r16, %r15, 0f00000000; -; CHECK-NEXT: st.param.b32 [func_retval0], %r16; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<17>; +; CHECK-SM80-NEXT: .reg .b64 %rd<5>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r7, %r3; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r5, %r1; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r8, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r11; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<5>; +; CHECK-SM100-NEXT: .reg .b64 %rd<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; +; CHECK-SM100-NEXT: // implicit-def: %r2 +; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd9, %rd7, %rd8; +; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: add.rn.f32 %r4, %r3, 0f00000000; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res } 
define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { -; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<15>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: add.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r10, %r12; -; CHECK-NEXT: add.rn.f32 %r14, %r13, 0f00000000; -; CHECK-NEXT: st.param.b32 [func_retval0], %r14; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<15>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r8, %r3, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r2, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r10, %r12; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, 0f00000000; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r14; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, 
[reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd4, %rd2, %rd1; +; CHECK-SM100-NEXT: mov.b32 %r8, 0f80000000; +; CHECK-SM100-NEXT: mov.b64 %rd5, {%r7, %r8}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd3, %rd5; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd4, %rd6; +; CHECK-SM100-NEXT: mov.b64 {%r9, %r10}, %rd7; +; CHECK-SM100-NEXT: add.rn.f32 %r11, %r9, %r10; +; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, 0f00000000; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in) ret float %res } @@ -273,17 +318,18 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmul_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r2; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r3; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r4; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r5; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r6; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r7; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r8; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0]; +; CHECK-NEXT: mul.rn.f32 %r9, %r5, %r6; +; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r7; +; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r8; +; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r1; +; CHECK-NEXT: 
mul.rn.f32 %r13, %r12, %r2; +; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r3; +; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -291,43 +337,85 @@ define float @reduce_fmul_float(<8 x float> %in) { } define float @reduce_fmul_float_reassoc(<8 x float> %in) { -; CHECK-LABEL: reduce_fmul_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: mul.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: mul.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r13; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-NEXT: .reg .b64 %rd<5>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r7, %r3; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r5, %r1; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r8, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r11; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<4>; +; CHECK-SM100-NEXT: .reg 
.b64 %rd<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; +; CHECK-SM100-NEXT: // implicit-def: %r2 +; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd9, %rd7, %rd8; +; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res } define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) { -; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: mul.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: mul.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: mul.rn.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, 
%r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r8, %r3, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r8; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r2, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r10, %r12; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<12>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd1; +; CHECK-SM100-NEXT: mov.b32 %r8, 0f3F800000; +; CHECK-SM100-NEXT: mov.b64 %rd5, {%r7, %r8}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd3, %rd5; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd4, %rd6; +; CHECK-SM100-NEXT: mov.b64 {%r9, %r10}, %rd7; +; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r9, %r10; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r11; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in) ret float %res } @@ -403,15 +491,16 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmax_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; 
CHECK-NEXT: max.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0]; +; CHECK-NEXT: max.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -425,15 +514,16 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmax_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-NEXT: max.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -535,15 +625,16 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmin_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; -; 
CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0]; +; CHECK-NEXT: min.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -557,15 +648,16 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmin_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-NEXT: min.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -667,15 +759,16 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmaximum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0]; +; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -689,15 +782,16 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmaximum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, 
%r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -799,15 +893,16 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fminimum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0]; +; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -821,15 +916,16 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fminimum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.NaN.f32 %r11, %r10, 
%r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index 765e50554c8d2..29939e323b4b1 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @@ -5,75 +6,104 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define <16 x float> @test_v16f32(<16 x float> %a) { ; CHECK-LABEL: test_v16f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_12_15:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; -; CHECK-DAG: ld.param.v4.b32 {[[V_8_11:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; -; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+32], {[[V_8_11]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+48], {[[V_12_15]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48]; +; CHECK-NEXT: st.param.v2.b64 
[func_retval0+48], {%rd7, %rd8}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <16 x float> %a } define <8 x float> @test_v8f32(<8 x float> %a) { ; CHECK-LABEL: test_v8f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <8 x float> %a } define <4 x float> @test_v4f32(<4 x float> %a) { ; CHECK-LABEL: test_v4f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <4 x float> %a } define <2 x float> @test_v2f32(<2 x float> %a) { ; CHECK-LABEL: test_v2f32( -; CHECK-DAG: ld.param.v2.b32 {[[V_0_3:(%r[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_3]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; ret <2 x float> %a } ; Oddly 
shaped vectors should not load any extra elements. define <3 x float> @test_v3f32(<3 x float> %a) { ; CHECK-LABEL: test_v3f32( -; CHECK-DAG: ld.param.b32 [[V_2:%r[0-9]+]], [test_v3f32_param_0+8]; -; CHECK-DAG: ld.param.v2.b32 {[[V_0_1:(%r[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.b32 [func_retval0+8], [[V_2]] -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8]; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; ret <3 x float> %a } define <8 x i64> @test_v8i64(<8 x i64> %a) { ; CHECK-LABEL: test_v8i64( -; CHECK-DAG: ld.param.v2.b64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; -; CHECK-DAG: ld.param.v2.b64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; -; CHECK-DAG: ld.param.v2.b64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; -; CHECK-DAG: ld.param.v2.b64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; -; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8i64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8i64_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v8i64_param_0+32]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v8i64_param_0+48]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], 
{%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <8 x i64> %a } define <16 x i16> @test_v16i16(<16 x i16> %a) { ; CHECK-LABEL: test_v16i16( -; CHECK-DAG: ld.param.v4.b32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_7]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_8_15]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16i16_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16i16_param_0+16]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; ret <16 x i16> %a } diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll index b08c19206a0b8..17468d56aa574 100644 --- a/llvm/test/CodeGen/NVPTX/vector-args.ll +++ b/llvm/test/CodeGen/NVPTX/vector-args.ll @@ -1,10 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define float @foo(<2 x float> %a) { -; CHECK: .func (.param .b32 func_retval0) foo -; CHECK: .param .align 8 .b8 foo_param_0[8] -; CHECK: ld.param.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo_param_0]; +; CHECK-NEXT: mul.rn.f32 %r3, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r4, %r1, %r1; +; CHECK-NEXT: add.rn.f32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %t1 = fmul <2 x float> %a, %a %t2 = extractelement <2 x 
float> %t1, i32 0 %t3 = extractelement <2 x float> %t1, i32 1 @@ -14,9 +23,17 @@ define float @foo(<2 x float> %a) { define float @bar(<4 x float> %a) { -; CHECK: .func (.param .b32 func_retval0) bar -; CHECK: .param .align 16 .b8 bar_param_0[16] -; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: bar( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [bar_param_0]; +; CHECK-NEXT: mul.rn.f32 %r5, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r6, %r1, %r1; +; CHECK-NEXT: add.rn.f32 %r7, %r6, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; %t1 = fmul <4 x float> %a, %a %t2 = extractelement <4 x float> %t1, i32 0 %t3 = extractelement <4 x float> %t1, i32 1 @@ -26,10 +43,18 @@ define float @bar(<4 x float> %a) { define <4 x float> @baz(<4 x float> %a) { -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) baz -; CHECK: .param .align 16 .b8 baz_param_0[16] -; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} -; CHECK: st.param.v4.b32 [func_retval0], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: baz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [baz_param_0]; +; CHECK-NEXT: mul.rn.f32 %r5, %r4, %r4; +; CHECK-NEXT: mul.rn.f32 %r6, %r3, %r3; +; CHECK-NEXT: mul.rn.f32 %r7, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r8, %r1, %r1; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; %t1 = fmul <4 x float> %a, %a ret <4 x float> %t1 } diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index 1ae6f6bcd748f..e16fc74325416 100644 --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -207,18 +207,18 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly 
align 16 %dst ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_global_a16_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_global_a16_param_1]; ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r5, %rs8; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs7; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs6; -; CHECK-NEXT: cvt.f32.f16 %r8, %rs5; -; CHECK-NEXT: cvt.f32.f16 %r9, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs2; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs6; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs8; +; CHECK-NEXT: cvt.f32.f16 %r12, %rs7; ; CHECK-NEXT: st.global.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9}; ; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; @@ -271,18 +271,18 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_generic_a16_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_generic_a16_param_1]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r5, %rs8; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs7; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs6; -; CHECK-NEXT: cvt.f32.f16 %r8, %rs5; -; CHECK-NEXT: cvt.f32.f16 %r9, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; -; CHECK-NEXT: 
cvt.f32.f16 %r11, %rs2; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs6; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs8; +; CHECK-NEXT: cvt.f32.f16 %r12, %rs7; ; CHECK-NEXT: st.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9}; ; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll index f3b1015070085..d07c740d32a72 100644 --- a/llvm/test/CodeGen/NVPTX/vector-stores.ll +++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll @@ -1,38 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK-LABEL: .visible .func foo1 -; CHECK: st.v2.b32 define void @foo1(<2 x float> %val, ptr %ptr) { +; CHECK-LABEL: foo1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1]; +; CHECK-NEXT: st.b64 [%rd2], %rd1; +; CHECK-NEXT: ret; store <2 x float> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo2 -; CHECK: st.v4.b32 define void @foo2(<4 x float> %val, ptr %ptr) { +; CHECK-LABEL: foo2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd3], {%rd1, %rd2}; +; CHECK-NEXT: ret; store <4 x float> %val, ptr %ptr ret 
void } -; CHECK-LABEL: .visible .func foo3 -; CHECK: st.v2.b32 define void @foo3(<2 x i32> %val, ptr %ptr) { +; CHECK-LABEL: foo3( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo3_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo3_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2}; +; CHECK-NEXT: ret; store <2 x i32> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo4 -; CHECK: st.v4.b32 define void @foo4(<4 x i32> %val, ptr %ptr) { +; CHECK-LABEL: foo4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo4_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; store <4 x i32> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func v16i8 define void @v16i8(ptr %a, ptr %b) { -; CHECK: ld.v4.b32 -; CHECK: st.v4.b32 +; CHECK-LABEL: v16i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [v16i8_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [v16i8_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; %v = load <16 x i8>, ptr %a store <16 x i8> %v, ptr %b ret void diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll new file mode 100644 index 0000000000000..59173e22edf26 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/check-zero-vector.ll @@ -0,0 +1,246 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: < %s | FileCheck %s --check-prefix=POWERPC_64LE + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ +; RUN: < %s | FileCheck %s 
--check-prefix=POWERPC_64 + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ +; RUN: < %s | FileCheck %s --check-prefix=POWERPC_32 + +define i32 @test_Greater_than(ptr %colauths, i32 signext %ncols) { +; This testcase is manually reduced to isolate the critical code blocks. +; It is designed to check for vector comparison specifically for zero vectors. +; In the vector.body section, we are expecting a comparison instruction (vcmpequh), +; merge instructions (vmrghh and vmrglh) which use exactly 2 vectors. +; The output of the merge instruction is being used by xxland and finally +; accumulated by vadduwm instruction. + +; POWERPC_64LE-LABEL: test_Greater_than: +; POWERPC_64LE: .LBB0_6: # %vector.body +; POWERPC_64LE-NEXT: # +; POWERPC_64LE-NEXT: lxv [[R1:[0-9]+]], -64(4) +; POWERPC_64LE-NEXT: vcmpequh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]] +; POWERPC_64LE-NEXT: xxlnor [[R1]], [[R1]], [[R1]] +; POWERPC_64LE-NEXT: vmrghh [[R4:[0-9]+]], [[R2]], [[R2]] +; POWERPC_64LE-NEXT: vmrglh [[R2]], [[R2]], [[R2]] +; POWERPC_64LE-NEXT: xxland [[R5:[0-9]+]], [[R5]], [[R6:[0-9]+]] +; POWERPC_64LE-NEXT: xxland [[R1]], [[R1]], [[R6]] +; POWERPC_64LE-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R4]] +; POWERPC_64LE: .LBB0_10: # %vec.epilog.vector.body +; POWERPC_64LE-NEXT: # +; POWERPC_64LE-NEXT: lxv [[R8:[0-9]+]], 0(4) +; POWERPC_64LE-NEXT: addi 4, 4, 16 +; POWERPC_64LE-NEXT: vcmpequh [[R9:[0-9]+]], [[R9]], [[R10:[0-9]+]] +; POWERPC_64LE-NEXT: xxlnor [[R8]], [[R8]], [[R8]] +; POWERPC_64LE-NEXT: vmrglh [[R11:[0-9]+]], [[R9]], [[R9]] +; POWERPC_64LE-NEXT: vmrghh [[R9]], [[R9]], [[R9]] +; POWERPC_64LE-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R6]] +; POWERPC_64LE-NEXT: xxland [[R8]], [[R8]], [[R6]] +; POWERPC_64LE-NEXT: vadduwm [[R7]], [[R7]], [[R9]] +; POWERPC_64LE-NEXT: vadduwm [[R3]], [[R3]], [[R11]] +; POWERPC_64LE-NEXT: bdnz .LBB0_10 +; POWERPC_64LE: blr +; +; POWERPC_64-LABEL: test_Greater_than: +; POWERPC_64: L..BB0_6: # %vector.body +; POWERPC_64-NEXT: # +; 
POWERPC_64-NEXT: lxv [[R1:[0-9]+]], -64(4) +; POWERPC_64-NEXT: vcmpequh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]] +; POWERPC_64-NEXT: xxlnor [[R1]], [[R1]], [[R1]] +; POWERPC_64-NEXT: vmrglh [[R4:[0-9]+]], [[R2]], [[R2]] +; POWERPC_64-NEXT: vmrghh [[R2]], [[R2]], [[R2]] +; POWERPC_64-NEXT: xxland [[R5:[0-9]+]], [[R5]], [[R6:[0-9]+]] +; POWERPC_64-NEXT: xxland [[R1]], [[R1]], [[R6]] +; POWERPC_64-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R4]] +; POWERPC_64: L..BB0_10: # %vec.epilog.vector.body +; POWERPC_64-NEXT: # +; POWERPC_64-NEXT: lxv [[R8:[0-9]+]], 0(4) +; POWERPC_64-NEXT: addi 4, 4, 16 +; POWERPC_64-NEXT: vcmpequh [[R9:[0-9]+]], [[R9]], [[R10:[0-9]+]] +; POWERPC_64-NEXT: xxlnor [[R8]], [[R8]], [[R8]] +; POWERPC_64-NEXT: vmrghh [[R11:[0-9]+]], [[R9]], [[R9]] +; POWERPC_64-NEXT: vmrglh [[R9]], [[R9]], [[R9]] +; POWERPC_64-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R6]] +; POWERPC_64-NEXT: xxland [[R8]], [[R8]], [[R6]] +; POWERPC_64-NEXT: vadduwm [[R7]], [[R7]], [[R9]] +; POWERPC_64-NEXT: vadduwm [[R3]], [[R3]], [[R11]] +; POWERPC_64-NEXT: bdnz L..BB0_10 +; POWERPC_64: blr +; +; POWERPC_32-LABEL: test_Greater_than: +; POWERPC_32: L..BB0_7: # %vector.body +; POWERPC_32-NEXT: # +; POWERPC_32-NEXT: lxv [[R1:[0-9]+]], 0(10) +; POWERPC_32-NEXT: addic [[R13:[0-9]+]], [[R13]], 64 +; POWERPC_32-NEXT: addze [[R14:[0-9]+]], [[R14]] +; POWERPC_32-NEXT: xor [[R15:[0-9]+]], [[R13]], [[R16:[0-9]+]] +; POWERPC_32-NEXT: or. 
[[R15]], [[R15]], [[R14]] +; POWERPC_32-NEXT: vcmpequh [[R2:[0-9]+]], [[R2]], [[R3:[0-9]+]] +; POWERPC_32-NEXT: xxlnor [[R1]], [[R1]], [[R1]] +; POWERPC_32-NEXT: vmrglh [[R4:[0-9]+]], [[R2]], [[R2]] +; POWERPC_32-NEXT: vmrghh [[R2]], [[R2]], [[R2]] +; POWERPC_32-NEXT: xxland [[R5:[0-9]+]], [[R5]], [[R6:[0-9]+]] +; POWERPC_32-NEXT: xxland [[R1]], [[R1]], [[R6]] +; POWERPC_32-NEXT: vadduwm [[R7:[0-9]+]], [[R7]], [[R4]] +; POWERPC_32: L..BB0_11: # %vec.epilog.vector.body +; POWERPC_32-NEXT: # +; POWERPC_32-NEXT: slwi [[R14]], [[R13]], 1 +; POWERPC_32-NEXT: addic [[R13]], [[R13]], 8 +; POWERPC_32-NEXT: addze [[R17:[0-9]+]], [[R17]] +; POWERPC_32-NEXT: lxvx [[R8:[0-9]+]], [[R18:[0-9]+]], [[R14]] +; POWERPC_32-NEXT: xor [[R14]], [[R13]], [[R16]] +; POWERPC_32-NEXT: or. [[R14]], [[R14]], [[R17]] +; POWERPC_32-NEXT: vcmpequh [[R9:[0-9]+]], [[R9]], [[R3]] +; POWERPC_32-NEXT: xxlnor [[R8]], [[R8]], [[R8]] +; POWERPC_32-NEXT: vmrghh [[R11:[0-9]+]], [[R9]], [[R9]] +; POWERPC_32-NEXT: vmrglh [[R9]], [[R9]], [[R9]] +; POWERPC_32-NEXT: xxland [[R12:[0-9]+]], [[R12]], [[R6]] +; POWERPC_32-NEXT: xxland [[R8]], [[R8]], [[R6]] +; POWERPC_32-NEXT: vadduwm [[R7]], [[R7]], [[R9]] +; POWERPC_32-NEXT: vadduwm [[R19:[0-9]+]], [[R19]], [[R11]] +; POWERPC_32-NEXT: bne 0, L..BB0_11 +; POWERPC_32: blr + entry: + %cmp5 = icmp sgt i32 %ncols, 0 + br i1 %cmp5, label %iter.check, label %for.cond.cleanup + +iter.check: ; preds = %entry + %wide.trip.count = zext nneg i32 %ncols to i64 + %min.iters.check = icmp ult i32 %ncols, 8 + br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check + +for.body.preheader: ; preds = %vec.epilog.iter.check, %vec.epilog.middle.block, %iter.check + %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec31, %vec.epilog.middle.block ] + %num_cols_needed.06.ph = phi i32 [ 0, %iter.check ], [ %33, %vec.epilog.iter.check ], [ %40, %vec.epilog.middle.block ] + br label %for.body + +vector.main.loop.iter.check: 
; preds = %iter.check + %min.iters.check9 = icmp ult i32 %ncols, 64 + br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph + +vector.ph: ; preds = %vector.main.loop.iter.check + %n.vec = and i64 %wide.trip.count, 2147483584 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %24, %vector.body ] + %vec.phi10 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %25, %vector.body ] + %vec.phi11 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %26, %vector.body ] + %vec.phi12 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %27, %vector.body ] + %vec.phi13 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %28, %vector.body ] + %vec.phi14 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %29, %vector.body ] + %vec.phi15 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %30, %vector.body ] + %vec.phi16 = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %31, %vector.body ] + %0 = getelementptr inbounds nuw i16, ptr %colauths, i64 %index + %1 = getelementptr inbounds nuw i8, ptr %0, i64 16 + %2 = getelementptr inbounds nuw i8, ptr %0, i64 32 + %3 = getelementptr inbounds nuw i8, ptr %0, i64 48 + %4 = getelementptr inbounds nuw i8, ptr %0, i64 64 + %5 = getelementptr inbounds nuw i8, ptr %0, i64 80 + %6 = getelementptr inbounds nuw i8, ptr %0, i64 96 + %7 = getelementptr inbounds nuw i8, ptr %0, i64 112 + %wide.load = load <8 x i16>, ptr %0, align 2, !tbaa !5 + %wide.load17 = load <8 x i16>, ptr %1, align 2, !tbaa !5 + %wide.load18 = load <8 x i16>, ptr %2, align 2, !tbaa !5 + %wide.load19 = load <8 x i16>, ptr %3, align 2, !tbaa !5 + %wide.load20 = load <8 x i16>, ptr %4, align 2, !tbaa !5 + %wide.load21 = load <8 x i16>, ptr %5, align 2, !tbaa !5 + %wide.load22 = load <8 x i16>, ptr %6, align 2, !tbaa !5 + %wide.load23 = load <8 x i16>, ptr %7, align 2, !tbaa !5 + %8 = icmp ne <8 x i16> %wide.load, 
zeroinitializer + %9 = icmp ne <8 x i16> %wide.load17, zeroinitializer + %10 = icmp ne <8 x i16> %wide.load18, zeroinitializer + %11 = icmp ne <8 x i16> %wide.load19, zeroinitializer + %12 = icmp ne <8 x i16> %wide.load20, zeroinitializer + %13 = icmp ne <8 x i16> %wide.load21, zeroinitializer + %14 = icmp ne <8 x i16> %wide.load22, zeroinitializer + %15 = icmp ne <8 x i16> %wide.load23, zeroinitializer + %16 = zext <8 x i1> %8 to <8 x i32> + %17 = zext <8 x i1> %9 to <8 x i32> + %18 = zext <8 x i1> %10 to <8 x i32> + %19 = zext <8 x i1> %11 to <8 x i32> + %20 = zext <8 x i1> %12 to <8 x i32> + %21 = zext <8 x i1> %13 to <8 x i32> + %22 = zext <8 x i1> %14 to <8 x i32> + %23 = zext <8 x i1> %15 to <8 x i32> + %24 = add <8 x i32> %vec.phi, %16 + %25 = add <8 x i32> %vec.phi10, %17 + %26 = add <8 x i32> %vec.phi11, %18 + %27 = add <8 x i32> %vec.phi12, %19 + %28 = add <8 x i32> %vec.phi13, %20 + %29 = add <8 x i32> %vec.phi14, %21 + %30 = add <8 x i32> %vec.phi15, %22 + %31 = add <8 x i32> %vec.phi16, %23 + %index.next = add nuw i64 %index, 64 + %32 = icmp eq i64 %index.next, %n.vec + br i1 %32, label %middle.block, label %vector.body, !llvm.loop !9 + +middle.block: ; preds = %vector.body + %bin.rdx = add <8 x i32> %25, %24 + %bin.rdx24 = add <8 x i32> %26, %bin.rdx + %bin.rdx25 = add <8 x i32> %27, %bin.rdx24 + %bin.rdx26 = add <8 x i32> %28, %bin.rdx25 + %bin.rdx27 = add <8 x i32> %29, %bin.rdx26 + %bin.rdx28 = add <8 x i32> %30, %bin.rdx27 + %bin.rdx29 = add <8 x i32> %31, %bin.rdx28 + %33 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bin.rdx29) + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check + +vec.epilog.iter.check: ; preds = %middle.block + %n.vec.remaining = and i64 %wide.trip.count, 56 + %min.epilog.iters.check = icmp eq i64 %n.vec.remaining, 0 + br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph + +vec.epilog.ph: ; preds = %vec.epilog.iter.check, 
%vector.main.loop.iter.check + %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ] + %bc.merge.rdx = phi i32 [ %33, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ] + %n.vec31 = and i64 %wide.trip.count, 2147483640 + %34 = insertelement <8 x i32> , i32 %bc.merge.rdx, i64 0 + br label %vec.epilog.vector.body + +vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph + %index32 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next35, %vec.epilog.vector.body ] + %vec.phi33 = phi <8 x i32> [ %34, %vec.epilog.ph ], [ %38, %vec.epilog.vector.body ] + %35 = getelementptr inbounds nuw i16, ptr %colauths, i64 %index32 + %wide.load34 = load <8 x i16>, ptr %35, align 2, !tbaa !5 + %36 = icmp ne <8 x i16> %wide.load34, zeroinitializer + %37 = zext <8 x i1> %36 to <8 x i32> + %38 = add <8 x i32> %vec.phi33, %37 + %index.next35 = add nuw i64 %index32, 8 + %39 = icmp eq i64 %index.next35, %n.vec31 + br i1 %39, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !llvm.loop !13 + +vec.epilog.middle.block: ; preds = %vec.epilog.vector.body + %40 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %38) + %cmp.n36 = icmp eq i64 %n.vec31, %wide.trip.count + br i1 %cmp.n36, label %for.cond.cleanup, label %for.body.preheader + +for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry + %num_cols_needed.0.lcssa = phi i32 [ 0, %entry ], [ %33, %middle.block ], [ %40, %vec.epilog.middle.block ], [ %spec.select, %for.body ] + ret i32 %num_cols_needed.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ] + %num_cols_needed.06 = phi i32 [ %spec.select, %for.body ], [ %num_cols_needed.06.ph, %for.body.preheader ] + %arrayidx = getelementptr inbounds nuw i16, ptr %colauths, i64 %indvars.iv + %41 = load i16, ptr %arrayidx, align 2, !tbaa !5 + 
%tobool.not = icmp ne i16 %41, 0 + %inc = zext i1 %tobool.not to i32 + %spec.select = add nuw nsw i32 %num_cols_needed.06, %inc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !14 +} + +!5 = !{!6, !6, i64 0} +!6 = !{!"short", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11, !12} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.isvectorized", i32 1} +!12 = !{!"llvm.loop.unroll.runtime.disable"} +!13 = distinct !{!13, !10, !11, !12} +!14 = distinct !{!14, !10, !12, !11} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir index 30e455f57737b..82cc6829838a0 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir @@ -624,6 +624,9 @@ # DEBUG-NEXT: G_RESET_FPMODE (opcode {{[0-9]+}}): 0 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_GET_ROUNDING (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_PTR_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index 22956f8fe3551..9d3fe3a90b463 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -47,9 +47,9 @@ define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_select_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmnot.m v0, v0 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -74,9 +74,9 @@ define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_both_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmnot.m v0, v0 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index 0aa0cbceefc76..317ad0c124e73 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -1022,6 +1022,111 @@ define @vmadc_vv( %a, %b, ret %2 } +define @vmadc_vim( %a, %mask, %b, iXLen %vl) { +; NOVLOPT-LABEL: vmadc_vim: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmadc.vim 
v11, v8, 5, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v11, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmadc_vim: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vmadc.vim v11, v8, 5, v0 +; VLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; VLOPT-NEXT: vmand.mm v0, v11, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmadc.carry.in.nxv4i32.i32( %a, i32 5, %mask, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv4i1( %1, %b, iXLen %vl) + ret %2 +} + +define @vmadc_vxm( %a, %mask, %b, i32 %c, iXLen %vl) { +; NOVLOPT-LABEL: vmadc_vxm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmadc.vxm v11, v8, a0, v0 +; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v11, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmadc_vxm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; VLOPT-NEXT: vmadc.vxm v11, v8, a0, v0 +; VLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; VLOPT-NEXT: vmand.mm v0, v11, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmadc.carry.in.nxv4i32.i32( %a, i32 %c, %mask, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv4i1( %1, %b, iXLen %vl) + ret %2 +} + +define @vmadc_vvm( %a, %mask, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vmadc_vvm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmadc.vvm v11, v8, v12, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v11, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmadc_vvm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vmadc.vvm v11, v8, v12, v0 +; VLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; VLOPT-NEXT: vmand.mm v0, v11, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmadc.carry.in.nxv4i32.nxv4i32( %a, %c, %mask, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv4i1( %1, %b, iXLen %vl) + ret %2 +} + +define @vmsbc_vvm( %a, %mask, %b, %c, iXLen %vl) { +; 
NOVLOPT-LABEL: vmsbc_vvm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmsbc.vvm v11, v8, v12, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v11, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmsbc_vvm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vmsbc.vvm v11, v8, v12, v0 +; VLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; VLOPT-NEXT: vmand.mm v0, v11, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmsbc.borrow.in.nxv4i32.nxv4i32( %a, %c, %mask, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv4i1( %1, %b, iXLen %vl) + ret %2 +} + +define @vmsbc_vxm( %a, %mask, %b, i32 %c, iXLen %vl) { +; NOVLOPT-LABEL: vmsbc_vxm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmsbc.vxm v11, v8, a0, v0 +; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; NOVLOPT-NEXT: vmand.mm v0, v11, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmsbc_vxm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; VLOPT-NEXT: vmsbc.vxm v11, v8, a0, v0 +; VLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; VLOPT-NEXT: vmand.mm v0, v11, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmsbc.borrow.in.nxv4i32.i32( %a, i32 %c, %mask, iXLen -1) + %2 = call @llvm.riscv.vmand.nxv4i1( %1, %b, iXLen %vl) + ret %2 +} + define @vmsbc_vx( %a, %b, i32 %c, iXLen %vl) { ; NOVLOPT-LABEL: vmsbc_vx: ; NOVLOPT: # %bb.0: @@ -5297,6 +5402,153 @@ define @vfwmaccbf16_vv( %a, %2 } +define @vsbc_vvm( %a, %mask, %b, %c, iXLen %vl) { +; NOVLOPT-LABEL: vsbc_vvm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsbc.vvm v8, v8, v10, v0 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsbc_vvm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsbc.vvm v8, v8, v10, v0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma 
+; VLOPT-NEXT: vadd.vv v8, v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.nxv4i1( poison, %a, %b, %mask, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %c, iXLen %vl) + ret %2 +} + +define @vsbc_vxm( %a, %mask, %b, i32 %c, iXLen %vl) { +; NOVLOPT-LABEL: vsbc_vxm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsbc.vxm v8, v8, a0, v0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsbc_vxm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsbc.vxm v8, v8, a0, v0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsbc.nxv4i32.i32.nxv4i1( poison, %a, i32 %c, %mask, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vfclass_v( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vfclass_v: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vfclass.v v8, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfclass_v: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vfclass.v v8, v8 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfclass.nxv4i32( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vrgather_vi( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vrgather_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vrgather.vi v12, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vrgather_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, 
m2, ta, ma +; VLOPT-NEXT: vrgather.vi v12, v8, 5 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vrgather.vx.nxv4i32.iXLen( poison, %a, iXLen 5, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vrgather_vv( %a, %idx, %b, iXLen %vl) { +; NOVLOPT-LABEL: vrgather_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vrgather.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vrgather_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vrgather.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vrgather.vv.nxv4i32( poison, %a, %idx, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vrgather_vx( %a, iXLen %idx, %b, iXLen %vl) { +; NOVLOPT-LABEL: vrgather_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vrgather.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vrgather_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; VLOPT-NEXT: vrgather.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vrgather.vx.nxv4i32.iXLen( poison, %a, iXLen %idx, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vrgatherei16_vv( %a, %idx, %b, iXLen %vl) { +; NOVLOPT-LABEL: vrgatherei16_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vrgatherei16.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, 
m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vrgatherei16_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vrgatherei16.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vrgatherei16.vv.nxv4i32( poison, %a, %idx, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + define @vfwmaccbf16_vf( %a, bfloat %b, %c, %d, iXLen %vl) { ; NOVLOPT-LABEL: vfwmaccbf16_vf: ; NOVLOPT: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir index 03204468dc14c..0b95e558d8236 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir @@ -55,3 +55,42 @@ body: | %mask:vmv0 = COPY $v0 %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */ ... +--- +name: vnclip_move_past_passthru +body: | + bb.0: + liveins: $x8, $v0, $v8 + ; CHECK-LABEL: name: vnclip_move_past_passthru + ; CHECK: liveins: $x8, $v0, $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl:gprnox0 = COPY $x8 + ; CHECK-NEXT: %passthru:vrnov0 = COPY $v8 + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %y:vrnov0 = PseudoVNCLIPU_WV_MF2_MASK %passthru, $noreg, $noreg, %mask, 0, %avl, 5 /* e32 */, 0 /* tu, mu */, implicit-def $vxsat + %avl:gprnox0 = COPY $x8 + %x:vr = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5, 3, implicit-def $vxsat + %passthru:vrnov0 = COPY $v8 + %mask:vmv0 = COPY $v0 + %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */ +... 
+--- +name: vnclip_cant_move_past_passthru +body: | + bb.0: + liveins: $x8, $v0, $v8 + ; CHECK-LABEL: name: vnclip_cant_move_past_passthru + ; CHECK: liveins: $x8, $v0, $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl:gprnox0 = COPY $x8 + ; CHECK-NEXT: %x:vr = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5 /* e32 */, 3 /* ta, ma */, implicit-def $vxsat + ; CHECK-NEXT: %vxsat:gpr = COPY $vxsat + ; CHECK-NEXT: %passthru:vrnov0 = COPY $v8 + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */ + %avl:gprnox0 = COPY $x8 + %x:vr = PseudoVNCLIPU_WV_MF2 $noreg, $noreg, $noreg, 0, -1, 5, 3, implicit-def $vxsat + %vxsat:gpr = COPY $vxsat + %passthru:vrnov0 = COPY $v8 + %mask:vmv0 = COPY $v0 + %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */ +... diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll index d9f9ad379ee95..59f4d95f45acc 100644 --- a/llvm/test/CodeGen/RISCV/select-cond.ll +++ b/llvm/test/CodeGen/RISCV/select-cond.ll @@ -5,6 +5,8 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-THEAD ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32-XQCICM +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32-XQCICS ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV64 ; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ @@ -35,6 +37,12 @@ define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 sign ; RV32-XQCICM-NEXT: mv a0, a1 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_trunc: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: andi a0, a0, 1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a1, a2 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_trunc: ; RV64: # %bb.0: ; 
RV64-NEXT: andi a3, a0, 1 @@ -80,6 +88,12 @@ define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signe ; RV32-XQCICM-NEXT: mv a0, a1 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_param: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: andi a0, a0, 1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a1, a2 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_param: ; RV64: # %bb.0: ; RV64-NEXT: andi a3, a0, 1 @@ -122,6 +136,13 @@ define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x ; RV32-XQCICM-NEXT: mv a0, a3 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_eq: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: xor a0, a0, a1 +; RV32-XQCICS-NEXT: seqz a0, a0 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a3 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_eq: ; RV64: # %bb.0: ; RV64-NEXT: beq a0, a1, .LBB2_2 @@ -164,6 +185,13 @@ define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x ; RV32-XQCICM-NEXT: mv a0, a3 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_ne: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: xor a0, a0, a1 +; RV32-XQCICS-NEXT: snez a0, a0 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a3 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_ne: ; RV64: # %bb.0: ; RV64-NEXT: bne a0, a1, .LBB3_2 @@ -206,6 +234,12 @@ define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a3 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_ugt: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu a0, a1, a0 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a3 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_ugt: ; RV64: # %bb.0: ; RV64-NEXT: bltu a1, a0, .LBB4_2 @@ -248,6 +282,12 @@ define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_uge: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu a0, a0, a1 +; 
RV32-XQCICS-NEXT: qc.selectnei a0, 0, a3, a2 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_uge: ; RV64: # %bb.0: ; RV64-NEXT: bgeu a0, a1, .LBB5_2 @@ -290,6 +330,12 @@ define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a3 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_ult: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu a0, a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a3 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_ult: ; RV64: # %bb.0: ; RV64-NEXT: bltu a0, a1, .LBB6_2 @@ -332,6 +378,12 @@ define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_ule: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu a0, a1, a0 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a3, a2 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_ule: ; RV64: # %bb.0: ; RV64-NEXT: bgeu a1, a0, .LBB7_2 @@ -374,6 +426,12 @@ define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a3 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_sgt: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt a0, a1, a0 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a3 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_sgt: ; RV64: # %bb.0: ; RV64-NEXT: blt a1, a0, .LBB8_2 @@ -416,6 +474,12 @@ define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_sge: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt a0, a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a3, a2 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_sge: ; RV64: # %bb.0: ; RV64-NEXT: bge a0, a1, .LBB9_2 @@ -458,6 +522,12 @@ define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a3 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: 
select_i32_slt: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt a0, a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a3 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_slt: ; RV64: # %bb.0: ; RV64-NEXT: blt a0, a1, .LBB10_2 @@ -500,6 +570,12 @@ define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, i32 signext % ; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i32_sle: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt a0, a1, a0 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a3, a2 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i32_sle: ; RV64: # %bb.0: ; RV64-NEXT: bge a1, a0, .LBB11_2 @@ -550,6 +626,14 @@ define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_trunc: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: andi a1, a0, 1 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a3, a5 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a2, a4 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_trunc: ; RV64: # %bb.0: ; RV64-NEXT: andi a3, a0, 1 @@ -601,6 +685,15 @@ define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a2 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_param: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: andi a5, a0, 1 +; RV32-XQCICS-NEXT: mv a0, a5 +; RV32-XQCICS-NEXT: qc.selectnei a5, 0, a2, a4 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a1, a3 +; RV32-XQCICS-NEXT: mv a1, a5 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_param: ; RV64: # %bb.0: ; RV64-NEXT: andi a3, a0, 1 @@ -657,6 +750,16 @@ define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a7 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_eq: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: xor a0, a0, a2 +; RV32-XQCICS-NEXT: or a1, a0, a1 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: 
qc.selecteqi a0, 0, a4, a6 +; RV32-XQCICS-NEXT: qc.selecteqi a1, 0, a5, a7 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_eq: ; RV64: # %bb.0: ; RV64-NEXT: beq a0, a1, .LBB14_2 @@ -713,6 +816,16 @@ define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a7 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_ne: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: xor a0, a0, a2 +; RV32-XQCICS-NEXT: or a1, a0, a1 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a4, a6 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a5, a7 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_ne: ; RV64: # %bb.0: ; RV64-NEXT: bne a0, a1, .LBB15_2 @@ -774,6 +887,18 @@ define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_ugt: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu t0, a3, a1 +; RV32-XQCICS-NEXT: sltu a0, a2, a0 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a4, a6 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a5, a7 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_ugt: ; RV64: # %bb.0: ; RV64-NEXT: bltu a1, a0, .LBB16_2 @@ -835,6 +960,18 @@ define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a7 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_uge: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu t0, a1, a3 +; RV32-XQCICS-NEXT: sltu a0, a0, a2 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a6, a4 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a7, a5 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_uge: ; RV64: # %bb.0: ; RV64-NEXT: bgeu a0, a1, 
.LBB17_2 @@ -896,6 +1033,18 @@ define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_ult: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu t0, a1, a3 +; RV32-XQCICS-NEXT: sltu a0, a0, a2 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a4, a6 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a5, a7 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_ult: ; RV64: # %bb.0: ; RV64-NEXT: bltu a0, a1, .LBB18_2 @@ -957,6 +1106,18 @@ define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a7 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_ule: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: sltu t0, a3, a1 +; RV32-XQCICS-NEXT: sltu a0, a2, a0 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a6, a4 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a7, a5 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_ule: ; RV64: # %bb.0: ; RV64-NEXT: bgeu a1, a0, .LBB19_2 @@ -1018,6 +1179,18 @@ define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_sgt: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt t0, a3, a1 +; RV32-XQCICS-NEXT: sltu a0, a2, a0 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a4, a6 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a5, a7 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_sgt: ; RV64: # %bb.0: ; RV64-NEXT: blt a1, a0, .LBB20_2 @@ -1079,6 +1252,18 @@ define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, 
i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a7 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_sge: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt t0, a1, a3 +; RV32-XQCICS-NEXT: sltu a0, a0, a2 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a6, a4 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a7, a5 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_sge: ; RV64: # %bb.0: ; RV64-NEXT: bge a0, a1, .LBB21_2 @@ -1140,6 +1325,18 @@ define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_slt: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt t0, a1, a3 +; RV32-XQCICS-NEXT: sltu a0, a0, a2 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a4, a6 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a5, a7 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_slt: ; RV64: # %bb.0: ; RV64-NEXT: blt a0, a1, .LBB22_2 @@ -1201,6 +1398,18 @@ define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: mv a1, a7 ; RV32-XQCICM-NEXT: ret ; +; RV32-XQCICS-LABEL: select_i64_sle: +; RV32-XQCICS: # %bb.0: +; RV32-XQCICS-NEXT: slt t0, a3, a1 +; RV32-XQCICS-NEXT: sltu a0, a2, a0 +; RV32-XQCICS-NEXT: xor a1, a1, a3 +; RV32-XQCICS-NEXT: seqz a1, a1 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a0, t0 +; RV32-XQCICS-NEXT: mv a0, a1 +; RV32-XQCICS-NEXT: qc.selectnei a0, 0, a6, a4 +; RV32-XQCICS-NEXT: qc.selectnei a1, 0, a7, a5 +; RV32-XQCICS-NEXT: ret +; ; RV64-LABEL: select_i64_sle: ; RV64: # %bb.0: ; RV64-NEXT: bge a1, a0, .LBB23_2 diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index 1a978d1a0fcac..9c8230572b926 100644 --- 
a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zknh,+v -target-abi=lp64f \ ; RUN: | FileCheck %s --check-prefixes=CHECK,RV64I -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba,+zbb,+f,+zknh,+v -target-abi=lp64f \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba,+zbb,+zbkb,+f,+zknh,+v -target-abi=lp64f \ ; RUN: | FileCheck %s --check-prefixes=CHECK,RV64ZBB -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba,+zbb,+f,+zknh,+v -target-abi=lp64f \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba,+zbb,+zbkb,+f,+zknh,+v -target-abi=lp64f \ ; RUN: -riscv-disable-sextw-removal | FileCheck %s --check-prefix=NOREMOVAL define void @test1(i32 signext %arg, i32 signext %arg1) nounwind { @@ -1499,3 +1499,186 @@ bb7: ; preds = %bb2 } declare i32 @llvm.riscv.vmv.x.s.nxv1i32( ) + +; Test that we can look through brev8 in hasAllNBitUsers. 
+define signext i32 @test21(i64 %arg1, i64 %arg2, i64 %arg3) { +; RV64I-LABEL: test21: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: lui a4, 209715 +; RV64I-NEXT: addi a3, a3, -241 +; RV64I-NEXT: addi a4, a4, 819 +; RV64I-NEXT: slli a5, a3, 32 +; RV64I-NEXT: add a3, a3, a5 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: li a5, 256 +; RV64I-NEXT: .LBB25_1: # %bb2 +; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: srli a6, a0, 4 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a6, a6, a3 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: srli a6, a0, 2 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: and a6, a6, a4 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: andi a6, a0, 65 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: slli a6, a6, 1 +; RV64I-NEXT: andi a0, a0, 1104 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: addi a2, a2, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: bltu a2, a5, .LBB25_1 +; RV64I-NEXT: # %bb.2: # %bb7 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: test21: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: addi a2, a2, -1 +; RV64ZBB-NEXT: li a3, 256 +; RV64ZBB-NEXT: .LBB25_1: # %bb2 +; RV64ZBB-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64ZBB-NEXT: brev8 a0, a0 +; RV64ZBB-NEXT: andi a0, a0, 1234 +; RV64ZBB-NEXT: addi a2, a2, 1 +; RV64ZBB-NEXT: addw a0, a0, a1 +; RV64ZBB-NEXT: bltu a2, a3, .LBB25_1 +; RV64ZBB-NEXT: # %bb.2: # %bb7 +; RV64ZBB-NEXT: ret +; +; NOREMOVAL-LABEL: test21: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB25_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: brev8 a0, a0 +; NOREMOVAL-NEXT: andi a0, a0, 1234 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB25_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; 
NOREMOVAL-NEXT: sext.w a0, a0 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %bswap = call i64 @llvm.bswap.i64(i64 %i1) + %bitreverse = call i64 @llvm.bitreverse.i64(i64 %bswap) + %i4 = and i64 %bitreverse, 1234 + %i5 = add i64 %i4, %arg2 + %i6 = icmp ugt i64 %i2, 255 + br i1 %i6, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %i7 = trunc i64 %i5 to i32 + ret i32 %i7 +} + +; Negative test for looking through brev8. Make sure we consider that it works +; on bytes. +define signext i32 @test22(i64 %arg1, i64 %arg2, i64 %arg3) { +; RV64I-LABEL: test22: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: lui a3, %hi(.LCPI26_0) +; RV64I-NEXT: lui a4, %hi(.LCPI26_1) +; RV64I-NEXT: lui a5, %hi(.LCPI26_2) +; RV64I-NEXT: lui a6, %hi(.LCPI26_3) +; RV64I-NEXT: li a7, 69 +; RV64I-NEXT: ld a3, %lo(.LCPI26_0)(a3) +; RV64I-NEXT: ld a4, %lo(.LCPI26_1)(a4) +; RV64I-NEXT: ld a5, %lo(.LCPI26_2)(a5) +; RV64I-NEXT: ld a6, %lo(.LCPI26_3)(a6) +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: li t0, 65 +; RV64I-NEXT: slli t0, t0, 28 +; RV64I-NEXT: li t1, 256 +; RV64I-NEXT: .LBB26_1: # %bb2 +; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: slli t2, a0, 11 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: and t2, t2, a3 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: srli t2, a0, 2 +; RV64I-NEXT: and a0, a0, a6 +; RV64I-NEXT: and t2, t2, a5 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: or a0, t2, a0 +; RV64I-NEXT: srli t2, a0, 1 +; RV64I-NEXT: and a0, a0, t0 +; RV64I-NEXT: and t2, t2, a7 +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: or a0, t2, a0 +; RV64I-NEXT: srli a0, a0, 28 +; RV64I-NEXT: addi a2, a2, 1 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: bltu a2, t1, .LBB26_1 +; RV64I-NEXT: # %bb.2: # %bb7 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: test22: +; 
RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: addi a2, a2, -1 +; RV64ZBB-NEXT: li a3, 256 +; RV64ZBB-NEXT: .LBB26_1: # %bb2 +; RV64ZBB-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64ZBB-NEXT: slli a0, a0, 7 +; RV64ZBB-NEXT: brev8 a0, a0 +; RV64ZBB-NEXT: srli a0, a0, 28 +; RV64ZBB-NEXT: andi a0, a0, 1234 +; RV64ZBB-NEXT: addi a2, a2, 1 +; RV64ZBB-NEXT: add a0, a0, a1 +; RV64ZBB-NEXT: bltu a2, a3, .LBB26_1 +; RV64ZBB-NEXT: # %bb.2: # %bb7 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: ret +; +; NOREMOVAL-LABEL: test22: +; NOREMOVAL: # %bb.0: # %entry +; NOREMOVAL-NEXT: addi a2, a2, -1 +; NOREMOVAL-NEXT: li a3, 256 +; NOREMOVAL-NEXT: .LBB26_1: # %bb2 +; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 +; NOREMOVAL-NEXT: slli a0, a0, 7 +; NOREMOVAL-NEXT: brev8 a0, a0 +; NOREMOVAL-NEXT: srli a0, a0, 28 +; NOREMOVAL-NEXT: andi a0, a0, 1234 +; NOREMOVAL-NEXT: addi a2, a2, 1 +; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: bltu a2, a3, .LBB26_1 +; NOREMOVAL-NEXT: # %bb.2: # %bb7 +; NOREMOVAL-NEXT: sext.w a0, a0 +; NOREMOVAL-NEXT: ret +entry: + br label %bb2 + +bb2: ; preds = %bb2, %entry + %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ] + %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ] + %i3 = add i64 %i2, 1 + %shl = shl i64 %i1, 7 + %bswap = call i64 @llvm.bswap.i64(i64 %shl) + %bitreverse = call i64 @llvm.bitreverse.i64(i64 %bswap) + %lshr = lshr i64 %bitreverse, 28 + %i4 = and i64 %lshr, 1234 + %i5 = add i64 %i4, %arg2 + %i6 = icmp ugt i64 %i2, 255 + br i1 %i6, label %bb7, label %bb2 + +bb7: ; preds = %bb2 + %i7 = trunc i64 %i5 to i32 + ret i32 %i7 +} diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll new file mode 100644 index 0000000000000..4cee0910608f3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqciac.ll @@ -0,0 +1,271 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck %s 
-check-prefix=RV32IM +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-xqciac -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IMXQCIAC +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-xqciac,+zba -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IZBAMXQCIAC + +define dso_local i32 @mul(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: mul: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: slli a0, a1, 5 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: mul: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: li a0, 33 +; RV32IMXQCIAC-NEXT: mul a0, a1, a0 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: mul: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: li a0, 33 +; RV32IZBAMXQCIAC-NEXT: mul a0, a1, a0 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 33 + ret i32 %mul +} + +define dso_local i32 @muliadd(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: muliadd: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: li a2, 165 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: muliadd: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, 165 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: muliadd: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, 165 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 165 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @muliadd2(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: muliadd2: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: li a2, 1111 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: muliadd2: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, 1111 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: muliadd2: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; 
RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, 1111 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 1111 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @muliadd_neg(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: muliadd_neg: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: li a2, -165 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: muliadd_neg: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, -165 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: muliadd_neg: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, -165 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, -165 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @muliadd_neg2(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: muliadd_neg2: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: li a2, -2045 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: muliadd_neg2: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, -2045 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: muliadd_neg2: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, -2045 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, -2045 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @pow2immplus1(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: pow2immplus1: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: slli a2, a1, 5 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: add a0, a2, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: pow2immplus1: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, 33 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: pow2immplus1: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, 33 +; RV32IZBAMXQCIAC-NEXT: ret 
+entry: + %mul = mul nsw i32 %b, 33 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @pow2immminus2(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: pow2immminus2: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: slli a2, a1, 1 +; RV32IM-NEXT: slli a1, a1, 7 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: pow2immminus2: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, 126 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: pow2immminus2: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, 126 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 126 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @pow2minuspow2(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: pow2minuspow2: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: slli a2, a1, 7 +; RV32IM-NEXT: slli a1, a1, 9 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: pow2minuspow2: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: qc.muliadd a0, a1, 384 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: pow2minuspow2: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: qc.muliadd a0, a1, 384 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 384 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @gtsimm12(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: gtsimm12: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: lui a2, 1 +; RV32IM-NEXT: addi a2, a2, 477 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: gtsimm12: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: lui a2, 1 +; RV32IMXQCIAC-NEXT: addi a2, a2, 477 +; RV32IMXQCIAC-NEXT: mul a1, a1, a2 +; RV32IMXQCIAC-NEXT: add a0, a0, a1 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: gtsimm12: +; 
RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: lui a2, 1 +; RV32IZBAMXQCIAC-NEXT: addi a2, a2, 477 +; RV32IZBAMXQCIAC-NEXT: mul a1, a1, a2 +; RV32IZBAMXQCIAC-NEXT: add a0, a0, a1 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 4573 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +; NOTE: This will become qc.shladd once support is added +define dso_local i32 @pow2(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: pow2: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: slli a1, a1, 5 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: pow2: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: slli a1, a1, 5 +; RV32IMXQCIAC-NEXT: add a0, a0, a1 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: pow2: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: slli a1, a1, 5 +; RV32IZBAMXQCIAC-NEXT: add a0, a0, a1 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 32 + %add = add nsw i32 %mul, %a + ret i32 %add +} + +define dso_local i32 @shxadd(i32 %a, i32 %b) local_unnamed_addr #0 { +; RV32IM-LABEL: shxadd: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: slli a1, a1, 1 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: ret +; +; RV32IMXQCIAC-LABEL: shxadd: +; RV32IMXQCIAC: # %bb.0: # %entry +; RV32IMXQCIAC-NEXT: slli a1, a1, 1 +; RV32IMXQCIAC-NEXT: add a0, a0, a1 +; RV32IMXQCIAC-NEXT: ret +; +; RV32IZBAMXQCIAC-LABEL: shxadd: +; RV32IZBAMXQCIAC: # %bb.0: # %entry +; RV32IZBAMXQCIAC-NEXT: sh1add a0, a1, a0 +; RV32IZBAMXQCIAC-NEXT: ret +entry: + %mul = mul nsw i32 %b, 2 + %add = add nsw i32 %mul, %a + ret i32 %add +} diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll new file mode 100644 index 0000000000000..0e90b1fda0ea2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcics.ll @@ -0,0 +1,452 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test that we are able to generate the Xqcics instructions +; RUN: llc 
-mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32IXQCICS +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics,+experimental-xqcicm -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32IXQCICS + +define i32 @select_cc_example_eq_s1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_s1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: andi a1, a0, 1 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: bnez a1, .LBB0_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 12 +; RV32I-NEXT: .LBB0_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq_s1: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: andi a0, a0, 1 +; RV32IXQCICS-NEXT: qc.selectinei a0, 0, a2, 12 +; RV32IXQCICS-NEXT: ret +entry: + %cond_trunc = trunc i32 %a to i1 + %sel = select i1 %cond_trunc, i32 %x, i32 12 + ret i32 %sel +} + +define i32 @select_cc_example_eq_s2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_s2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: andi a1, a0, 1 +; RV32I-NEXT: bnez a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: li a0, 12 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq_s2: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: andi a0, a0, 1 +; RV32IXQCICS-NEXT: qc.selectieqi a0, 0, a2, 12 +; RV32IXQCICS-NEXT: ret +entry: + %cond_trunc = trunc i32 %a to i1 + %sel = select i1 %cond_trunc, i32 12, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_eq_s3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_s3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: bnez a0, .LBB2_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 25 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB2_2: +; 
RV32I-NEXT: li a0, 12 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq_s3: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: andi a0, a0, 1 +; RV32IXQCICS-NEXT: li a1, 25 +; RV32IXQCICS-NEXT: qc.selectieqi a0, 0, a1, 12 +; RV32IXQCICS-NEXT: ret +entry: + %cond_trunc = trunc i32 %a to i1 + %sel = select i1 %cond_trunc, i32 12, i32 25 + ret i32 %sel +} + +define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: beq a0, a1, .LBB3_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: .LBB3_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selecteqi a0, 11, a2, a3 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 %a, 11 + %sel = select i1 %cmp, i32 %x, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_eq_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: beq a0, a1, .LBB4_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: .LBB4_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq_c: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selecteqi a0, 11, a2, a3 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 11, %a + %sel = select i1 %cmp, i32 %x, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ne: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: bne a0, a1, .LBB5_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: .LBB5_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ne: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectnei a0, 11, a2, 
a3 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 %a, 11 + %sel = select i1 %cmp, i32 %x, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ne_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ne_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: bne a0, a1, .LBB6_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: .LBB6_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ne_c: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectnei a0, 11, a2, a3 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 11, %a + %sel = select i1 %cmp, i32 %x, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_eqi(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqi: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a0, a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB7_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eqi: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectieq a0, a1, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 %a, %b + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_eqi_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqi_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a0, a1, .LBB8_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB8_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eqi_c: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectine a0, a1, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 %a, %b + %sel = select i1 %cmp, i32 11, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_nei(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_nei: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a0, a1, 
.LBB9_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB9_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_nei: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectine a0, a1, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 %a, %b + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_nei_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_nei_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a0, a1, .LBB10_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB10_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_nei_c: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectieq a0, a1, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 %a, %b + %sel = select i1 %cmp, i32 11, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_ieqi(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ieqi: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: beq a0, a1, .LBB11_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB11_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ieqi: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectieqi a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 %a, 12 + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ieqi_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ieqi_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: beq a0, a1, .LBB12_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB12_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ieqi_c1: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: 
qc.selectieqi a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 12, %a + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ieqi_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ieqi_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: bne a0, a1, .LBB13_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB13_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ieqi_c2: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectinei a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 %a, 12 + %sel = select i1 %cmp, i32 11, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_ieqi_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ieqi_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: bne a0, a1, .LBB14_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB14_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ieqi_c3: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectinei a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 12, %a + %sel = select i1 %cmp, i32 11, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_inei(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_inei: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: bne a0, a1, .LBB15_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB15_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_inei: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectinei a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 %a, 12 + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_inei_c1(i32 %a, i32 %b, i32 %x, 
i32 %y) { +; RV32I-LABEL: select_cc_example_inei_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: bne a0, a1, .LBB16_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB16_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_inei_c1: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectinei a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 12, %a + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_inei_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_inei_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: beq a0, a1, .LBB17_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB17_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_inei_c2: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectieqi a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 %a, 12 + %sel = select i1 %cmp, i32 11, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_inei_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_inei_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 12 +; RV32I-NEXT: beq a0, a1, .LBB18_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB18_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_inei_c3: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectieqi a0, 12, a2, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 12, %a + %sel = select i1 %cmp, i32 11, i32 %x + ret i32 %sel +} + +define i32 @select_cc_example_eqii(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqii: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a0, a1, .LBB19_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: ret +; RV32I-NEXT: 
.LBB19_2: +; RV32I-NEXT: li a0, 13 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eqii: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectiieq a0, a1, 13, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp eq i32 %a, %b + %sel = select i1 %cmp, i32 13, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_neii(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_neii: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a0, a1, .LBB20_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB20_2: +; RV32I-NEXT: li a0, 13 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_neii: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectiine a0, a1, 13, 11 +; RV32IXQCICS-NEXT: ret +entry: + %cmp = icmp ne i32 %a, %b + %sel = select i1 %cmp, i32 13, i32 11 + ret i32 %sel +} + diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll index 9194d7842a6d3..9772c8311bfbc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll @@ -6,8 +6,7 @@ define arm_aapcs_vfpcc <4 x float> @arm_max_no_idx_f32_mve(ptr %pSrc, i32 %block ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: subs r2, r1, #4 -; CHECK-NEXT: movw r3, #0 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: movt r3, #65408 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: dlstp.32 lr, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll index 6b5b6b2b1b677..573a9420b5278 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -5,7 +5,6 @@ define void @arm_cmplx_mag_squared_q15_mve(ptr %pSrc, ptr %pDst, i32 
%blockSize) ; CHECK-LABEL: arm_cmplx_mag_squared_q15_mve: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: subs.w r3, r2, #8 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll index 1feb5feb7a9ee..7190e162eb010 100644 --- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll @@ -107,11 +107,9 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NEXT: v128.bitselect ; CHECK-NEXT: local.tee 0 ; CHECK-NEXT: v128.const 0, 0 -; CHECK-NEXT: local.tee 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i64x2.gt_s -; CHECK-NEXT: v128.bitselect +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: # fallthrough-return @@ -1558,11 +1556,9 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-NEXT: v128.bitselect ; CHECK-NEXT: local.tee 0 ; CHECK-NEXT: v128.const 0, 0 -; CHECK-NEXT: local.tee 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: local.get 1 ; CHECK-NEXT: i64x2.gt_s -; CHECK-NEXT: v128.bitselect +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: # fallthrough-return diff --git a/llvm/test/CodeGen/WebAssembly/simd-select.ll b/llvm/test/CodeGen/WebAssembly/simd-select.ll index 715e73e6c18f2..bb06445b2dcf0 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-select.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-select.ll @@ -552,3 +552,73 @@ define <2 x double> @select_eq_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) { %res = select i1 %c, <2 x double> %x, <2 x double> %y ret <2 x double> %res } + +define <4 x i32> @select_splat_first_zero_and_icmp(<4 x i32> %x) { +; 
CHECK-LABEL: select_splat_first_zero_and_icmp: +; CHECK: .functype select_splat_first_zero_and_icmp (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 2139095040, 2139095040, 2139095040, 2139095040 +; CHECK-NEXT: v128.and +; CHECK-NEXT: v128.const 0, 0, 0, 0 +; CHECK-NEXT: i32x4.ne +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: # fallthrough-return + %a = and <4 x i32> %x, splat (i32 2139095040) + %c = icmp eq <4 x i32> %a, zeroinitializer + %res = select <4 x i1> %c, <4 x i32> zeroinitializer, <4 x i32> %x + ret <4 x i32> %res +} + +define <4 x i32> @select_splat_second_zero_and_icmp(<4 x i32> %x) { +; CHECK-LABEL: select_splat_second_zero_and_icmp: +; CHECK: .functype select_splat_second_zero_and_icmp (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.const 2139095040, 2139095040, 2139095040, 2139095040 +; CHECK-NEXT: v128.and +; CHECK-NEXT: v128.const 0, 0, 0, 0 +; CHECK-NEXT: i32x4.eq +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: # fallthrough-return + %a = and <4 x i32> %x, splat (i32 2139095040) + %c = icmp eq <4 x i32> %a, zeroinitializer + %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + +define <4 x i32> @select_splat_first_zero_cond_input(<4 x i1> %c, <4 x i32> %x) { +; CHECK-LABEL: select_splat_first_zero_cond_input: +; CHECK: .functype select_splat_first_zero_cond_input (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 31 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 31 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: v128.bitselect +; CHECK-NEXT: # fallthrough-return + %res = select <4 x i1> %c, <4 x i32> zeroinitializer, <4 x i32> %x + ret <4 x i32> %res +} + +define <4 x i32> @select_splat_second_zero_cond_input(<4 x i1> %c, <4 x i32> %x) { +; CHECK-LABEL: 
select_splat_second_zero_cond_input: +; CHECK: .functype select_splat_second_zero_cond_input (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 31 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 31 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: v128.and +; CHECK-NEXT: # fallthrough-return + %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer + ret <4 x i32> %res +} + diff --git a/llvm/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll b/llvm/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll index 2cf09a936f592..3ad6492978438 100644 --- a/llvm/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll +++ b/llvm/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll @@ -8,9 +8,8 @@ entry: ret x86_fp80 %tmp2 ; CHECK-LABEL: foo: -; CHECK: fldt 16(%esp) +; CHECK: fldt 4(%esp) ; CHECK-NEXT: fsqrt -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: ret } @@ -21,11 +20,10 @@ entry: %tmp2 = call x86_fp80 @llvm.powi.f80.i32( x86_fp80 %x, i32 3 ) ret x86_fp80 %tmp2 ; CHECK-LABEL: bar: -; CHECK: fldt 16(%esp) +; CHECK: fldt 4(%esp) ; CHECK-NEXT: fld %st(0) ; CHECK-NEXT: fmul %st(1) ; CHECK-NEXT: fmulp -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: ret } diff --git a/llvm/test/CodeGen/X86/coalesce-commutative-implicit-def.mir b/llvm/test/CodeGen/X86/coalesce-commutative-implicit-def.mir index fe1235fe94f85..1f38430f631cc 100644 --- a/llvm/test/CodeGen/X86/coalesce-commutative-implicit-def.mir +++ b/llvm/test/CodeGen/X86/coalesce-commutative-implicit-def.mir @@ -35,3 +35,24 @@ body: | %0:gr64_with_sub_8bit = COPY %1:gr64_with_sub_8bit RET 0, implicit %0 ... +# Commuting instruction with 3 ops is handled correctly. 
+--- +name: commuting_3_ops +tracksRegLiveness: true +body: | + bb.0: + liveins: $ymm0, $ymm1 + + ; CHECK-LABEL: name: commuting_3_ops + ; CHECK: liveins: $ymm0, $ymm1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vr256 = COPY $ymm1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr256 = COPY $ymm0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr256 = contract nofpexcept VFMADD213PSYr [[COPY1]], [[COPY]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: RET 0, implicit [[COPY1]] + %0:vr256 = COPY $ymm1 + %1:vr256 = COPY $ymm0 + %0:vr256 = contract nofpexcept VFMADD231PSYr %0:vr256, %0:vr256, %1:vr256, implicit $mxcsr + %1:vr256 = COPY %0:vr256 + RET 0, implicit %1 +... diff --git a/llvm/test/CodeGen/X86/flt-rounds.ll b/llvm/test/CodeGen/X86/flt-rounds.ll index a5908978a5438..1d7a8d8456c27 100644 --- a/llvm/test/CodeGen/X86/flt-rounds.ll +++ b/llvm/test/CodeGen/X86/flt-rounds.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s --check-prefix=X86 -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=X86 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=X64 +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s --check-prefixes=X86,SDAG-X86 +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=X86,SDAG-X86 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefixes=X64,SDAG-X64 +; RUN: llc -mtriple=i686-unknown-linux-gnu -global-isel=1 -global-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=X86,GISEL-X86 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -global-isel=1 -global-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=X64,GISEL-X64 declare i32 
@llvm.get.rounding() @@ -37,139 +39,309 @@ define i32 @test_flt_rounds() nounwind { ; Make sure we preserve order with fesetround. define i32 @multiple_flt_rounds() nounwind { -; X86-LABEL: multiple_flt_rounds: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp -; X86-NEXT: movl $1024, (%esp) # imm = 0x400 -; X86-NEXT: calll fesetround -; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $9, %ecx -; X86-NEXT: andb $6, %cl -; X86-NEXT: movl $45, %esi -; X86-NEXT: movl $45, %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: andl $3, %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl $3, %eax -; X86-NEXT: setne %bl -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: calll fesetround -; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $9, %ecx -; X86-NEXT: andb $6, %cl -; X86-NEXT: movl $45, %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: andl $3, %eax -; X86-NEXT: cmpl $1, %eax -; X86-NEXT: je .LBB1_2 -; X86-NEXT: # %bb.1: # %entry -; X86-NEXT: incl %ebx -; X86-NEXT: .LBB1_2: # %entry -; X86-NEXT: movl $3072, (%esp) # imm = 0xC00 -; X86-NEXT: calll fesetround -; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $9, %ecx -; X86-NEXT: andb $6, %cl -; X86-NEXT: movl $45, %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: andl $3, %eax -; X86-NEXT: cmpl $1, %eax -; X86-NEXT: sbbl $-1, %ebx -; X86-NEXT: movl $2048, (%esp) # imm = 0x800 -; X86-NEXT: calll fesetround -; X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $9, %ecx -; X86-NEXT: andb $6, %cl -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: andl $3, %esi -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl $2, %esi 
-; X86-NEXT: setne %cl -; X86-NEXT: negl %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %ecx, %ebx -; X86-NEXT: setne %al -; X86-NEXT: addl $20, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx -; X86-NEXT: retl +; SDAG-X86-LABEL: multiple_flt_rounds: +; SDAG-X86: # %bb.0: # %entry +; SDAG-X86-NEXT: pushl %ebx +; SDAG-X86-NEXT: pushl %esi +; SDAG-X86-NEXT: subl $20, %esp +; SDAG-X86-NEXT: movl $1024, (%esp) # imm = 0x400 +; SDAG-X86-NEXT: calll fesetround +; SDAG-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SDAG-X86-NEXT: shrl $9, %ecx +; SDAG-X86-NEXT: andb $6, %cl +; SDAG-X86-NEXT: movl $45, %esi +; SDAG-X86-NEXT: movl $45, %eax +; SDAG-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X86-NEXT: shrl %cl, %eax +; SDAG-X86-NEXT: andl $3, %eax +; SDAG-X86-NEXT: xorl %ebx, %ebx +; SDAG-X86-NEXT: cmpl $3, %eax +; SDAG-X86-NEXT: setne %bl +; SDAG-X86-NEXT: movl $0, (%esp) +; SDAG-X86-NEXT: calll fesetround +; SDAG-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SDAG-X86-NEXT: shrl $9, %ecx +; SDAG-X86-NEXT: andb $6, %cl +; SDAG-X86-NEXT: movl $45, %eax +; SDAG-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X86-NEXT: shrl %cl, %eax +; SDAG-X86-NEXT: andl $3, %eax +; SDAG-X86-NEXT: cmpl $1, %eax +; SDAG-X86-NEXT: je .LBB1_2 +; SDAG-X86-NEXT: # %bb.1: # %entry +; SDAG-X86-NEXT: incl %ebx +; SDAG-X86-NEXT: .LBB1_2: # %entry +; SDAG-X86-NEXT: movl $3072, (%esp) # imm = 0xC00 +; SDAG-X86-NEXT: calll fesetround +; SDAG-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SDAG-X86-NEXT: shrl $9, %ecx +; SDAG-X86-NEXT: andb $6, %cl +; SDAG-X86-NEXT: movl $45, %eax +; SDAG-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X86-NEXT: shrl %cl, %eax +; SDAG-X86-NEXT: andl $3, %eax +; SDAG-X86-NEXT: cmpl $1, %eax +; SDAG-X86-NEXT: sbbl $-1, %ebx +; SDAG-X86-NEXT: movl $2048, (%esp) # imm = 0x800 +; SDAG-X86-NEXT: calll fesetround +; 
SDAG-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SDAG-X86-NEXT: shrl $9, %ecx +; SDAG-X86-NEXT: andb $6, %cl +; SDAG-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X86-NEXT: shrl %cl, %esi +; SDAG-X86-NEXT: andl $3, %esi +; SDAG-X86-NEXT: xorl %ecx, %ecx +; SDAG-X86-NEXT: cmpl $2, %esi +; SDAG-X86-NEXT: setne %cl +; SDAG-X86-NEXT: negl %ecx +; SDAG-X86-NEXT: xorl %eax, %eax +; SDAG-X86-NEXT: cmpl %ecx, %ebx +; SDAG-X86-NEXT: setne %al +; SDAG-X86-NEXT: addl $20, %esp +; SDAG-X86-NEXT: popl %esi +; SDAG-X86-NEXT: popl %ebx +; SDAG-X86-NEXT: retl ; -; X64-LABEL: multiple_flt_rounds: -; X64: # %bb.0: # %entry -; X64-NEXT: pushq %rbp -; X64-NEXT: pushq %r14 -; X64-NEXT: pushq %rbx -; X64-NEXT: subq $16, %rsp -; X64-NEXT: movl $1024, %edi # imm = 0x400 -; X64-NEXT: callq fesetround -; X64-NEXT: fnstcw {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; X64-NEXT: shrl $9, %ecx -; X64-NEXT: andb $6, %cl -; X64-NEXT: movl $45, %ebx -; X64-NEXT: movl $45, %eax -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: andl $3, %eax -; X64-NEXT: xorl %r14d, %r14d -; X64-NEXT: cmpl $3, %eax -; X64-NEXT: setne %r14b -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: callq fesetround -; X64-NEXT: fnstcw {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; X64-NEXT: shrl $9, %ecx -; X64-NEXT: andb $6, %cl -; X64-NEXT: movl $45, %eax -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: andl $3, %eax -; X64-NEXT: leal 1(%r14), %ebp -; X64-NEXT: cmpl $1, %eax -; X64-NEXT: cmovel %r14d, %ebp -; X64-NEXT: movl $3072, %edi # imm = 0xC00 -; X64-NEXT: callq fesetround -; X64-NEXT: fnstcw {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; X64-NEXT: shrl $9, %ecx -; X64-NEXT: andb $6, %cl -; X64-NEXT: movl $45, %eax -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: andl $3, %eax -; X64-NEXT: cmpl $1, 
%eax -; X64-NEXT: sbbl $-1, %ebp -; X64-NEXT: movl $2048, %edi # imm = 0x800 -; X64-NEXT: callq fesetround -; X64-NEXT: fnstcw {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; X64-NEXT: shrl $9, %ecx -; X64-NEXT: andb $6, %cl -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %ebx -; X64-NEXT: andl $3, %ebx -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpl $2, %ebx -; X64-NEXT: setne %cl -; X64-NEXT: negl %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %ecx, %ebp -; X64-NEXT: setne %al -; X64-NEXT: addq $16, %rsp -; X64-NEXT: popq %rbx -; X64-NEXT: popq %r14 -; X64-NEXT: popq %rbp -; X64-NEXT: retq +; SDAG-X64-LABEL: multiple_flt_rounds: +; SDAG-X64: # %bb.0: # %entry +; SDAG-X64-NEXT: pushq %rbp +; SDAG-X64-NEXT: pushq %r14 +; SDAG-X64-NEXT: pushq %rbx +; SDAG-X64-NEXT: subq $16, %rsp +; SDAG-X64-NEXT: movl $1024, %edi # imm = 0x400 +; SDAG-X64-NEXT: callq fesetround +; SDAG-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; SDAG-X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SDAG-X64-NEXT: shrl $9, %ecx +; SDAG-X64-NEXT: andb $6, %cl +; SDAG-X64-NEXT: movl $45, %ebx +; SDAG-X64-NEXT: movl $45, %eax +; SDAG-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X64-NEXT: shrl %cl, %eax +; SDAG-X64-NEXT: andl $3, %eax +; SDAG-X64-NEXT: xorl %r14d, %r14d +; SDAG-X64-NEXT: cmpl $3, %eax +; SDAG-X64-NEXT: setne %r14b +; SDAG-X64-NEXT: xorl %edi, %edi +; SDAG-X64-NEXT: callq fesetround +; SDAG-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; SDAG-X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SDAG-X64-NEXT: shrl $9, %ecx +; SDAG-X64-NEXT: andb $6, %cl +; SDAG-X64-NEXT: movl $45, %eax +; SDAG-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X64-NEXT: shrl %cl, %eax +; SDAG-X64-NEXT: andl $3, %eax +; SDAG-X64-NEXT: leal 1(%r14), %ebp +; SDAG-X64-NEXT: cmpl $1, %eax +; SDAG-X64-NEXT: cmovel %r14d, %ebp +; SDAG-X64-NEXT: movl $3072, %edi # imm = 0xC00 +; SDAG-X64-NEXT: callq fesetround +; SDAG-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; SDAG-X64-NEXT: movzwl 
{{[0-9]+}}(%rsp), %ecx +; SDAG-X64-NEXT: shrl $9, %ecx +; SDAG-X64-NEXT: andb $6, %cl +; SDAG-X64-NEXT: movl $45, %eax +; SDAG-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X64-NEXT: shrl %cl, %eax +; SDAG-X64-NEXT: andl $3, %eax +; SDAG-X64-NEXT: cmpl $1, %eax +; SDAG-X64-NEXT: sbbl $-1, %ebp +; SDAG-X64-NEXT: movl $2048, %edi # imm = 0x800 +; SDAG-X64-NEXT: callq fesetround +; SDAG-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; SDAG-X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SDAG-X64-NEXT: shrl $9, %ecx +; SDAG-X64-NEXT: andb $6, %cl +; SDAG-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; SDAG-X64-NEXT: shrl %cl, %ebx +; SDAG-X64-NEXT: andl $3, %ebx +; SDAG-X64-NEXT: xorl %ecx, %ecx +; SDAG-X64-NEXT: cmpl $2, %ebx +; SDAG-X64-NEXT: setne %cl +; SDAG-X64-NEXT: negl %ecx +; SDAG-X64-NEXT: xorl %eax, %eax +; SDAG-X64-NEXT: cmpl %ecx, %ebp +; SDAG-X64-NEXT: setne %al +; SDAG-X64-NEXT: addq $16, %rsp +; SDAG-X64-NEXT: popq %rbx +; SDAG-X64-NEXT: popq %r14 +; SDAG-X64-NEXT: popq %rbp +; SDAG-X64-NEXT: retq +; +; GISEL-X86-LABEL: multiple_flt_rounds: +; GISEL-X86: # %bb.0: # %entry +; GISEL-X86-NEXT: pushl %ebp +; GISEL-X86-NEXT: pushl %ebx +; GISEL-X86-NEXT: pushl %edi +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: movl $1, %ebp +; GISEL-X86-NEXT: movl $1024, (%esp) # imm = 0x400 +; GISEL-X86-NEXT: calll fesetround +; GISEL-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: shrl $9, %ecx +; GISEL-X86-NEXT: andb $6, %cl +; GISEL-X86-NEXT: movl $45, %edi +; GISEL-X86-NEXT: movl $45, %eax +; GISEL-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X86-NEXT: shrl %cl, %eax +; GISEL-X86-NEXT: andl $3, %eax +; GISEL-X86-NEXT: xorl %ebx, %ebx +; GISEL-X86-NEXT: cmpl $3, %eax +; GISEL-X86-NEXT: setne %bl +; GISEL-X86-NEXT: andl $1, %ebx +; GISEL-X86-NEXT: movl $0, (%esp) +; GISEL-X86-NEXT: calll fesetround +; GISEL-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movzwl 
{{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: shrl $9, %ecx +; GISEL-X86-NEXT: andb $6, %cl +; GISEL-X86-NEXT: movl $45, %edx +; GISEL-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X86-NEXT: shrl %cl, %edx +; GISEL-X86-NEXT: andl $3, %edx +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: cmpl $1, %edx +; GISEL-X86-NEXT: sete %cl +; GISEL-X86-NEXT: testl %ebx, %ebx +; GISEL-X86-NEXT: je .LBB1_2 +; GISEL-X86-NEXT: # %bb.1: # %entry +; GISEL-X86-NEXT: movl $2, %ebp +; GISEL-X86-NEXT: .LBB1_2: # %entry +; GISEL-X86-NEXT: xorl %esi, %esi +; GISEL-X86-NEXT: movb %cl, %al +; GISEL-X86-NEXT: andl $1, %eax +; GISEL-X86-NEXT: je .LBB1_4 +; GISEL-X86-NEXT: # %bb.3: # %entry +; GISEL-X86-NEXT: movl %ebx, %ebp +; GISEL-X86-NEXT: .LBB1_4: # %entry +; GISEL-X86-NEXT: movl $3072, (%esp) # imm = 0xC00 +; GISEL-X86-NEXT: calll fesetround +; GISEL-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: shrl $9, %ecx +; GISEL-X86-NEXT: andb $6, %cl +; GISEL-X86-NEXT: movl $45, %eax +; GISEL-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X86-NEXT: shrl %cl, %eax +; GISEL-X86-NEXT: andl $3, %eax +; GISEL-X86-NEXT: xorl %ebx, %ebx +; GISEL-X86-NEXT: cmpl %esi, %eax +; GISEL-X86-NEXT: setne %bl +; GISEL-X86-NEXT: andl $1, %ebx +; GISEL-X86-NEXT: addl %ebp, %ebx +; GISEL-X86-NEXT: movl $2048, (%esp) # imm = 0x800 +; GISEL-X86-NEXT: calll fesetround +; GISEL-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: shrl $9, %ecx +; GISEL-X86-NEXT: andb $6, %cl +; GISEL-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X86-NEXT: shrl %cl, %edi +; GISEL-X86-NEXT: andl $3, %edi +; GISEL-X86-NEXT: xorl %ecx, %ecx +; GISEL-X86-NEXT: movl $2, %eax +; GISEL-X86-NEXT: cmpl %eax, %edi +; GISEL-X86-NEXT: setne %cl +; GISEL-X86-NEXT: shll $31, %ecx +; GISEL-X86-NEXT: sarl $31, %ecx +; GISEL-X86-NEXT: xorl %eax, %eax +; GISEL-X86-NEXT: cmpl %ecx, %ebx +; GISEL-X86-NEXT: setne %al +; 
GISEL-X86-NEXT: andl $1, %eax +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: popl %esi +; GISEL-X86-NEXT: popl %edi +; GISEL-X86-NEXT: popl %ebx +; GISEL-X86-NEXT: popl %ebp +; GISEL-X86-NEXT: retl +; +; GISEL-X64-LABEL: multiple_flt_rounds: +; GISEL-X64: # %bb.0: # %entry +; GISEL-X64-NEXT: pushq %rbp +; GISEL-X64-NEXT: pushq %r15 +; GISEL-X64-NEXT: pushq %r14 +; GISEL-X64-NEXT: pushq %rbx +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: movl $1, %r14d +; GISEL-X64-NEXT: movl $2, %ebp +; GISEL-X64-NEXT: movl $1024, %edi # imm = 0x400 +; GISEL-X64-NEXT: callq fesetround +; GISEL-X64-NEXT: fnstcw (%rsp) +; GISEL-X64-NEXT: movzwl (%rsp), %ecx +; GISEL-X64-NEXT: shrl $9, %ecx +; GISEL-X64-NEXT: andb $6, %cl +; GISEL-X64-NEXT: movl $45, %ebx +; GISEL-X64-NEXT: movl $45, %eax +; GISEL-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X64-NEXT: shrl %cl, %eax +; GISEL-X64-NEXT: andl $3, %eax +; GISEL-X64-NEXT: xorl %r15d, %r15d +; GISEL-X64-NEXT: cmpl $3, %eax +; GISEL-X64-NEXT: setne %r15b +; GISEL-X64-NEXT: andl $1, %r15d +; GISEL-X64-NEXT: xorl %edi, %edi +; GISEL-X64-NEXT: callq fesetround +; GISEL-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; GISEL-X64-NEXT: shrl $9, %ecx +; GISEL-X64-NEXT: andb $6, %cl +; GISEL-X64-NEXT: movl $45, %eax +; GISEL-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X64-NEXT: shrl %cl, %eax +; GISEL-X64-NEXT: andl $3, %eax +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpl $1, %eax +; GISEL-X64-NEXT: sete %cl +; GISEL-X64-NEXT: testl %r15d, %r15d +; GISEL-X64-NEXT: cmovel %r14d, %ebp +; GISEL-X64-NEXT: andl $1, %ecx +; GISEL-X64-NEXT: cmovnel %r15d, %ebp +; GISEL-X64-NEXT: movl $3072, %edi # imm = 0xC00 +; GISEL-X64-NEXT: callq fesetround +; GISEL-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; GISEL-X64-NEXT: shrl $9, %ecx +; GISEL-X64-NEXT: andb $6, %cl +; GISEL-X64-NEXT: movl $45, %eax +; GISEL-X64-NEXT: # kill: def $cl killed 
$cl killed $ecx +; GISEL-X64-NEXT: shrl %cl, %eax +; GISEL-X64-NEXT: andl $3, %eax +; GISEL-X64-NEXT: xorl %r14d, %r14d +; GISEL-X64-NEXT: cmpl $0, %eax +; GISEL-X64-NEXT: setne %r14b +; GISEL-X64-NEXT: andl $1, %r14d +; GISEL-X64-NEXT: addl %ebp, %r14d +; GISEL-X64-NEXT: movl $2048, %edi # imm = 0x800 +; GISEL-X64-NEXT: callq fesetround +; GISEL-X64-NEXT: fnstcw {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; GISEL-X64-NEXT: shrl $9, %ecx +; GISEL-X64-NEXT: andb $6, %cl +; GISEL-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X64-NEXT: shrl %cl, %ebx +; GISEL-X64-NEXT: andl $3, %ebx +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpl $2, %ebx +; GISEL-X64-NEXT: setne %cl +; GISEL-X64-NEXT: shll $31, %ecx +; GISEL-X64-NEXT: sarl $31, %ecx +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: cmpl %ecx, %r14d +; GISEL-X64-NEXT: setne %al +; GISEL-X64-NEXT: andl $1, %eax +; GISEL-X64-NEXT: addq $8, %rsp +; GISEL-X64-NEXT: popq %rbx +; GISEL-X64-NEXT: popq %r14 +; GISEL-X64-NEXT: popq %r15 +; GISEL-X64-NEXT: popq %rbp +; GISEL-X64-NEXT: retq entry: %call = tail call i32 @fesetround(i32 1024) %0 = tail call i32 @llvm.get.rounding() diff --git a/llvm/test/CodeGen/X86/fp128-abi.ll b/llvm/test/CodeGen/X86/fp128-abi.ll deleted file mode 100644 index 526ed7c72f73f..0000000000000 --- a/llvm/test/CodeGen/X86/fp128-abi.ll +++ /dev/null @@ -1,659 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py - -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-X64 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-X86 -; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC64 -; RUN: llc < %s -mtriple=i686-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC32 -; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu 
-verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MINGW - -define fp128 @return(ptr %p) { -; CHECK-X64-LABEL: return: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: movaps (%rdi), %xmm0 -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: return: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: pushl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 12 -; CHECK-X86-NEXT: .cfi_offset %esi, -12 -; CHECK-X86-NEXT: .cfi_offset %edi, -8 -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl (%ecx), %edx -; CHECK-X86-NEXT: movl 4(%ecx), %esi -; CHECK-X86-NEXT: movl 8(%ecx), %edi -; CHECK-X86-NEXT: movl 12(%ecx), %ecx -; CHECK-X86-NEXT: movl %ecx, 12(%eax) -; CHECK-X86-NEXT: movl %edi, 8(%eax) -; CHECK-X86-NEXT: movl %esi, 4(%eax) -; CHECK-X86-NEXT: movl %edx, (%eax) -; CHECK-X86-NEXT: popl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: popl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-NEXT: retl $4 -; -; CHECK-MSVC64-LABEL: return: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: return: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %edi -; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl (%ecx), %edx -; CHECK-MSVC32-NEXT: movl 4(%ecx), %esi -; CHECK-MSVC32-NEXT: movl 8(%ecx), %edi -; CHECK-MSVC32-NEXT: movl 12(%ecx), %ecx -; CHECK-MSVC32-NEXT: movl %ecx, 12(%eax) -; CHECK-MSVC32-NEXT: movl %edi, 8(%eax) -; CHECK-MSVC32-NEXT: movl %esi, 4(%eax) -; CHECK-MSVC32-NEXT: movl %edx, (%eax) -; CHECK-MSVC32-NEXT: popl %esi -; CHECK-MSVC32-NEXT: popl %edi -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: return: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-NEXT: retq - %r = load fp128, ptr %p, align 16 - ret 
fp128 %r -} - -define fp128 @first_arg(fp128 %x) { -; CHECK-X64-LABEL: first_arg: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: first_arg: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: pushl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 12 -; CHECK-X86-NEXT: .cfi_offset %esi, -12 -; CHECK-X86-NEXT: .cfi_offset %edi, -8 -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-X86-NEXT: movl %edi, 12(%eax) -; CHECK-X86-NEXT: movl %esi, 8(%eax) -; CHECK-X86-NEXT: movl %edx, 4(%eax) -; CHECK-X86-NEXT: movl %ecx, (%eax) -; CHECK-X86-NEXT: popl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: popl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-NEXT: retl $4 -; -; CHECK-MSVC64-LABEL: first_arg: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: first_arg: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %edi -; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) -; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) -; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) -; CHECK-MSVC32-NEXT: movl %ecx, (%eax) -; CHECK-MSVC32-NEXT: popl %esi -; CHECK-MSVC32-NEXT: popl %edi -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: first_arg: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-NEXT: retq - ret fp128 %x -} - -define fp128 @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, fp128 %x) { -; CHECK-X64-LABEL: leading_args: -; 
CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: leading_args: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: pushl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 12 -; CHECK-X86-NEXT: .cfi_offset %esi, -12 -; CHECK-X86-NEXT: .cfi_offset %edi, -8 -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-X86-NEXT: movl %edi, 12(%eax) -; CHECK-X86-NEXT: movl %esi, 8(%eax) -; CHECK-X86-NEXT: movl %edx, 4(%eax) -; CHECK-X86-NEXT: movl %ecx, (%eax) -; CHECK-X86-NEXT: popl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: popl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-NEXT: retl $4 -; -; CHECK-MSVC64-LABEL: leading_args: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movaps (%rax), %xmm0 -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: leading_args: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %edi -; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) -; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) -; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) -; CHECK-MSVC32-NEXT: movl %ecx, (%eax) -; CHECK-MSVC32-NEXT: popl %esi -; CHECK-MSVC32-NEXT: popl %edi -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: leading_args: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movaps (%rax), %xmm0 -; CHECK-MINGW-NEXT: retq - ret fp128 %x -} - -define fp128 @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, fp128 %_5, 
fp128 %x) { -; CHECK-X64-LABEL: many_leading_args: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: movaps %xmm1, %xmm0 -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: many_leading_args: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: pushl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 12 -; CHECK-X86-NEXT: .cfi_offset %esi, -12 -; CHECK-X86-NEXT: .cfi_offset %edi, -8 -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-X86-NEXT: movl %edi, 12(%eax) -; CHECK-X86-NEXT: movl %esi, 8(%eax) -; CHECK-X86-NEXT: movl %edx, 4(%eax) -; CHECK-X86-NEXT: movl %ecx, (%eax) -; CHECK-X86-NEXT: popl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: popl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-NEXT: retl $4 -; -; CHECK-MSVC64-LABEL: many_leading_args: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movaps (%rax), %xmm0 -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: many_leading_args: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %edi -; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) -; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) -; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) -; CHECK-MSVC32-NEXT: movl %ecx, (%eax) -; CHECK-MSVC32-NEXT: popl %esi -; CHECK-MSVC32-NEXT: popl %edi -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: many_leading_args: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movaps (%rax), %xmm0 -; CHECK-MINGW-NEXT: retq 
- ret fp128 %x -} - -define fp128 @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, fp128 %x, i64 %_5) { -; CHECK-X64-LABEL: trailing_arg: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: trailing_arg: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: pushl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 12 -; CHECK-X86-NEXT: .cfi_offset %esi, -12 -; CHECK-X86-NEXT: .cfi_offset %edi, -8 -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-X86-NEXT: movl %edi, 12(%eax) -; CHECK-X86-NEXT: movl %esi, 8(%eax) -; CHECK-X86-NEXT: movl %edx, 4(%eax) -; CHECK-X86-NEXT: movl %ecx, (%eax) -; CHECK-X86-NEXT: popl %esi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-NEXT: popl %edi -; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-NEXT: retl $4 -; -; CHECK-MSVC64-LABEL: trailing_arg: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movaps (%rax), %xmm0 -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: trailing_arg: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %edi -; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) -; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) -; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) -; CHECK-MSVC32-NEXT: movl %ecx, (%eax) -; CHECK-MSVC32-NEXT: popl %esi -; CHECK-MSVC32-NEXT: popl %edi -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: trailing_arg: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movaps 
(%rax), %xmm0 -; CHECK-MINGW-NEXT: retq - ret fp128 %x -} - -define void @call_first_arg(fp128 %x) nounwind { -; CHECK-X64-LABEL: call_first_arg: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: pushq %rax -; CHECK-X64-NEXT: callq first_arg@PLT -; CHECK-X64-NEXT: popq %rax -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: call_first_arg: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl %eax -; CHECK-X86-NEXT: calll first_arg@PLT -; CHECK-X86-NEXT: addl $56, %esp -; CHECK-X86-NEXT: retl -; -; CHECK-MSVC64-LABEL: call_first_arg: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: subq $56, %rsp -; CHECK-MSVC64-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; CHECK-MSVC64-NEXT: callq first_arg -; CHECK-MSVC64-NEXT: addq $56, %rsp -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: call_first_arg: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %ebp -; CHECK-MSVC32-NEXT: movl %esp, %ebp -; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl %eax -; CHECK-MSVC32-NEXT: calll _first_arg -; CHECK-MSVC32-NEXT: addl $20, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp -; CHECK-MSVC32-NEXT: popl %ebp -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: call_first_arg: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: subq $56, %rsp -; CHECK-MINGW-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; CHECK-MINGW-NEXT: callq first_arg -; CHECK-MINGW-NEXT: addq $56, 
%rsp -; CHECK-MINGW-NEXT: retq - call i128 @first_arg(fp128 %x) - ret void -} - -define void @call_leading_args(fp128 %x) nounwind { -; CHECK-X64-LABEL: call_leading_args: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: pushq %rax -; CHECK-X64-NEXT: xorl %edi, %edi -; CHECK-X64-NEXT: xorl %esi, %esi -; CHECK-X64-NEXT: xorl %edx, %edx -; CHECK-X64-NEXT: xorl %ecx, %ecx -; CHECK-X64-NEXT: callq leading_args@PLT -; CHECK-X64-NEXT: popq %rax -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: call_leading_args: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax -; CHECK-X86-NEXT: calll leading_args@PLT -; CHECK-X86-NEXT: addl $88, %esp -; CHECK-X86-NEXT: retl -; -; CHECK-MSVC64-LABEL: call_leading_args: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: subq $72, %rsp -; CHECK-MSVC64-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: xorl %ecx, %ecx -; CHECK-MSVC64-NEXT: xorl %edx, %edx -; CHECK-MSVC64-NEXT: xorl %r8d, %r8d -; CHECK-MSVC64-NEXT: xorl %r9d, %r9d -; CHECK-MSVC64-NEXT: callq leading_args -; CHECK-MSVC64-NEXT: addq $72, %rsp -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: call_leading_args: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %ebp -; CHECK-MSVC32-NEXT: movl %esp, %ebp -; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; 
CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax -; CHECK-MSVC32-NEXT: calll _leading_args -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp -; CHECK-MSVC32-NEXT: popl %ebp -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: call_leading_args: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: subq $72, %rsp -; CHECK-MINGW-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: xorl %ecx, %ecx -; CHECK-MINGW-NEXT: xorl %edx, %edx -; CHECK-MINGW-NEXT: xorl %r8d, %r8d -; CHECK-MINGW-NEXT: xorl %r9d, %r9d -; CHECK-MINGW-NEXT: callq leading_args -; CHECK-MINGW-NEXT: addq $72, %rsp -; CHECK-MINGW-NEXT: retq - call i128 @leading_args(i64 0, i64 0, i64 0, i64 0, fp128 %x) - ret void -} - -define void @call_many_leading_args(fp128 %x) nounwind { -; CHECK-X64-LABEL: call_many_leading_args: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: pushq %rax -; CHECK-X64-NEXT: movaps %xmm0, %xmm1 -; CHECK-X64-NEXT: xorps %xmm0, %xmm0 -; CHECK-X64-NEXT: xorl %edi, %edi -; CHECK-X64-NEXT: xorl %esi, %esi -; CHECK-X64-NEXT: xorl %edx, %edx -; CHECK-X64-NEXT: xorl %ecx, %ecx -; CHECK-X64-NEXT: callq many_leading_args@PLT -; CHECK-X64-NEXT: popq %rax -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: call_many_leading_args: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax -; CHECK-X86-NEXT: calll many_leading_args@PLT -; CHECK-X86-NEXT: addl $104, %esp -; CHECK-X86-NEXT: retl -; -; CHECK-MSVC64-LABEL: call_many_leading_args: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: subq $88, %rsp -; CHECK-MSVC64-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-NEXT: xorps %xmm1, %xmm1 -; CHECK-MSVC64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: xorl %ecx, %ecx -; CHECK-MSVC64-NEXT: xorl %edx, %edx -; CHECK-MSVC64-NEXT: xorl %r8d, %r8d -; CHECK-MSVC64-NEXT: xorl %r9d, %r9d -; CHECK-MSVC64-NEXT: callq many_leading_args -; CHECK-MSVC64-NEXT: addq $88, %rsp -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: call_many_leading_args: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %ebp -; CHECK-MSVC32-NEXT: movl %esp, %ebp -; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; 
CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax -; CHECK-MSVC32-NEXT: calll _many_leading_args -; CHECK-MSVC32-NEXT: addl $68, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp -; CHECK-MSVC32-NEXT: popl %ebp -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: call_many_leading_args: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: subq $88, %rsp -; CHECK-MINGW-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-NEXT: xorps %xmm1, %xmm1 -; CHECK-MINGW-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: xorl %ecx, %ecx -; CHECK-MINGW-NEXT: xorl %edx, %edx -; CHECK-MINGW-NEXT: xorl %r8d, %r8d -; CHECK-MINGW-NEXT: xorl %r9d, %r9d -; CHECK-MINGW-NEXT: callq many_leading_args -; CHECK-MINGW-NEXT: addq $88, %rsp -; CHECK-MINGW-NEXT: retq - call i128 @many_leading_args(i64 0, i64 0, i64 0, i64 0, fp128 0xL0, fp128 %x) - ret void -} - -define void @call_trailing_arg(fp128 %x) nounwind { -; CHECK-X64-LABEL: call_trailing_arg: -; CHECK-X64: # %bb.0: -; CHECK-X64-NEXT: pushq %rax -; CHECK-X64-NEXT: xorl %edi, %edi -; CHECK-X64-NEXT: xorl %esi, %esi -; CHECK-X64-NEXT: xorl %edx, %edx -; CHECK-X64-NEXT: xorl %ecx, %ecx -; CHECK-X64-NEXT: callq trailing_arg@PLT -; CHECK-X64-NEXT: popq %rax -; CHECK-X64-NEXT: retq -; -; CHECK-X86-LABEL: call_trailing_arg: -; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: 
pushl $0 -; CHECK-X86-NEXT: pushl %eax -; CHECK-X86-NEXT: calll trailing_arg@PLT -; CHECK-X86-NEXT: addl $88, %esp -; CHECK-X86-NEXT: retl -; -; CHECK-MSVC64-LABEL: call_trailing_arg: -; CHECK-MSVC64: # %bb.0: -; CHECK-MSVC64-NEXT: subq $72, %rsp -; CHECK-MSVC64-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-NEXT: xorl %ecx, %ecx -; CHECK-MSVC64-NEXT: xorl %edx, %edx -; CHECK-MSVC64-NEXT: xorl %r8d, %r8d -; CHECK-MSVC64-NEXT: xorl %r9d, %r9d -; CHECK-MSVC64-NEXT: callq trailing_arg -; CHECK-MSVC64-NEXT: addq $72, %rsp -; CHECK-MSVC64-NEXT: retq -; -; CHECK-MSVC32-LABEL: call_trailing_arg: -; CHECK-MSVC32: # %bb.0: -; CHECK-MSVC32-NEXT: pushl %ebp -; CHECK-MSVC32-NEXT: movl %esp, %ebp -; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax -; CHECK-MSVC32-NEXT: calll _trailing_arg -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp -; CHECK-MSVC32-NEXT: popl %ebp -; CHECK-MSVC32-NEXT: retl -; -; CHECK-MINGW-LABEL: call_trailing_arg: -; CHECK-MINGW: # %bb.0: -; CHECK-MINGW-NEXT: subq $72, %rsp -; CHECK-MINGW-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MINGW-NEXT: xorl %ecx, %ecx -; CHECK-MINGW-NEXT: xorl %edx, %edx -; CHECK-MINGW-NEXT: xorl %r8d, %r8d -; CHECK-MINGW-NEXT: xorl %r9d, 
%r9d -; CHECK-MINGW-NEXT: callq trailing_arg -; CHECK-MINGW-NEXT: addq $72, %rsp -; CHECK-MINGW-NEXT: retq - call i128 @trailing_arg(i64 0, i64 0, i64 0, i64 0, fp128 %x) - ret void -} diff --git a/llvm/test/CodeGen/X86/i128-abi.ll b/llvm/test/CodeGen/X86/i128-abi.ll deleted file mode 100644 index 264c546b4cae2..0000000000000 --- a/llvm/test/CodeGen/X86/i128-abi.ll +++ /dev/null @@ -1,97 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 2 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s - -define i128 @in_reg(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i128 %a4) { -; CHECK-LABEL: in_reg: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %r9, %rdx -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: retq - ret i128 %a4 -} - -define i128 @on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5) { -; CHECK-LABEL: on_stack: -; CHECK: # %bb.0: -; CHECK-NEXT: movq 8(%rsp), %rax -; CHECK-NEXT: movq 16(%rsp), %rdx -; CHECK-NEXT: retq - ret i128 %a5 -} - -define i128 @on_stack2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i128 %a6) { -; CHECK-LABEL: on_stack2: -; CHECK: # %bb.0: -; CHECK-NEXT: movq 24(%rsp), %rax -; CHECK-NEXT: movq 32(%rsp), %rdx -; CHECK-NEXT: retq - ret i128 %a6 -} - -define i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i64 %a6) { -; CHECK-LABEL: trailing_arg_on_stack: -; CHECK: # %bb.0: -; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: retq - ret i64 %a6 -} - -define void @call_in_reg(i128 %x) nounwind { -; CHECK-LABEL: call_in_reg: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rsi, %r9 -; CHECK-NEXT: movq %rdi, %r8 -; CHECK-NEXT: movl $1, %esi -; CHECK-NEXT: movl $2, %edx -; CHECK-NEXT: movl $3, %ecx -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: callq in_reg@PLT -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq - call i128 @in_reg(i64 0, i64 1, i64 2, i64 3, i128 %x) - ret void -} - -define void @call_on_stack(i128 %x) 
nounwind { -; CHECK-LABEL: call_on_stack: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %r9 -; CHECK-NEXT: movl $1, %esi -; CHECK-NEXT: movl $2, %edx -; CHECK-NEXT: movl $3, %ecx -; CHECK-NEXT: movl $4, %r8d -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: pushq %r9 -; CHECK-NEXT: callq on_stack@PLT -; CHECK-NEXT: addq $16, %rsp -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq - call i128 @on_stack(i64 0, i64 1, i64 2, i64 3, i64 4, i128 %x) - ret void -} - -define void @call_trailing_arg_on_stack(i128 %x, i64 %y) nounwind { -; CHECK-LABEL: call_trailing_arg_on_stack: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rdx, %r9 -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %r10 -; CHECK-NEXT: movl $1, %esi -; CHECK-NEXT: movl $2, %edx -; CHECK-NEXT: movl $3, %ecx -; CHECK-NEXT: movl $4, %r8d -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: pushq %r10 -; CHECK-NEXT: callq trailing_arg_on_stack@PLT -; CHECK-NEXT: addq $16, %rsp -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq - call i128 @trailing_arg_on_stack(i64 0, i64 1, i64 2, i64 3, i64 4, i128 %x, i64 %y) - ret void -} diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll new file mode 100644 index 0000000000000..be8f7923b8f98 --- /dev/null +++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll @@ -0,0 +1,873 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; Combined ABI tests for fp128 and i128 + +; RUN: sed 's/PrimTy/fp128/g' %s | sed 's/Prim0/0xL0/g' | llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-X64-F128 +; RUN: sed 's/PrimTy/i128/g' %s | sed 's/Prim0/0/g' | llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-X64-I128 +; RUN: sed 's/PrimTy/fp128/g' %s | sed 's/Prim0/0xL0/g' | llc -mtriple=x86_64-pc-windows-msvc 
-verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC64-F128 +; RUN: sed 's/PrimTy/i128/g' %s | sed 's/Prim0/0/g' | llc -mtriple=x86_64-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC64-I128 +; RUN: sed 's/PrimTy/fp128/g' %s | sed 's/Prim0/0xL0/g' | llc -mtriple=x86_64-pc-windows-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MINGW-F128 +; RUN: sed 's/PrimTy/i128/g' %s | sed 's/Prim0/0/g' | llc -mtriple=x86_64-pc-windows-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MINGW-I128 +; +; Use the same directive for i128 and fp128 on x86-32 since both are passed and returned on the stack. +; RUN: sed 's/PrimTy/fp128/g' %s | sed 's/Prim0/0xL0/g' | llc -mtriple=i686-unknown-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-X86 +; RUN: sed 's/PrimTy/i128/g' %s | sed 's/Prim0/0/g' | llc -mtriple=i686-unknown-linux-gnu -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-X86 +; RUN: sed 's/PrimTy/fp128/g' %s | sed 's/Prim0/0xL0/g' | llc -mtriple=i686-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC32 +; RUN: sed 's/PrimTy/i128/g' %s | sed 's/Prim0/0/g' | llc -mtriple=i686-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC32 + +define PrimTy @return(ptr %p) nounwind { +; CHECK-X64-F128-LABEL: return: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: movaps (%rdi), %xmm0 +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: return: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq (%rdi), %rax +; CHECK-X64-I128-NEXT: movq 8(%rdi), %rdx +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: return: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: return: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq (%rcx), %rax +; CHECK-MSVC64-I128-NEXT: movq 8(%rcx), %rdx +; CHECK-MSVC64-I128-NEXT: retq +; +; 
CHECK-MINGW-F128-LABEL: return: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: return: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: movq (%rcx), %rax +; CHECK-MINGW-I128-NEXT: movq 8(%rcx), %rdx +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: return: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-X86-NEXT: movl (%ecx), %edx +; CHECK-X86-NEXT: movl 4(%ecx), %esi +; CHECK-X86-NEXT: movl 8(%ecx), %edi +; CHECK-X86-NEXT: movl 12(%ecx), %ecx +; CHECK-X86-NEXT: movl %ecx, 12(%eax) +; CHECK-X86-NEXT: movl %edi, 8(%eax) +; CHECK-X86-NEXT: movl %esi, 4(%eax) +; CHECK-X86-NEXT: movl %edx, (%eax) +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl $4 +; +; CHECK-MSVC32-LABEL: return: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-MSVC32-NEXT: movl (%ecx), %edx +; CHECK-MSVC32-NEXT: movl 4(%ecx), %esi +; CHECK-MSVC32-NEXT: movl 8(%ecx), %edi +; CHECK-MSVC32-NEXT: movl 12(%ecx), %ecx +; CHECK-MSVC32-NEXT: movl %ecx, 12(%eax) +; CHECK-MSVC32-NEXT: movl %edi, 8(%eax) +; CHECK-MSVC32-NEXT: movl %esi, 4(%eax) +; CHECK-MSVC32-NEXT: movl %edx, (%eax) +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: retl + %r = load PrimTy, ptr %p, align 16 + ret PrimTy %r +} + +define PrimTy @first_arg(PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: first_arg: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: first_arg: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq %rsi, %rdx +; CHECK-X64-I128-NEXT: movq %rdi, %rax +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: first_arg: +; 
CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: first_arg: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq %rcx, %rax +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: first_arg: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: first_arg: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: movq %rcx, %rax +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: first_arg: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: movl %edi, 12(%eax) +; CHECK-X86-NEXT: movl %esi, 8(%eax) +; CHECK-X86-NEXT: movl %edx, 4(%eax) +; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl $4 +; +; CHECK-MSVC32-LABEL: first_arg: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) +; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) +; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) +; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: retl + ret PrimTy %x +} + +define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: leading_args: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: retq +; +; 
CHECK-X64-I128-LABEL: leading_args: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq %r9, %rdx +; CHECK-X64-I128-NEXT: movq %r8, %rax +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: leading_args: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movaps (%rax), %xmm0 +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: leading_args: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: leading_args: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movaps (%rax), %xmm0 +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: leading_args: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: leading_args: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: movl %edi, 12(%eax) +; CHECK-X86-NEXT: movl %esi, 8(%eax) +; CHECK-X86-NEXT: movl %edx, 4(%eax) +; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl $4 +; +; CHECK-MSVC32-LABEL: leading_args: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; 
CHECK-MSVC32-NEXT: movl %edi, 12(%eax) +; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) +; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) +; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: retl + ret PrimTy %x +} + +define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy %_5, PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: many_leading_args: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: movaps %xmm1, %xmm0 +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: many_leading_args: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: many_leading_args: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movaps (%rax), %xmm0 +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: many_leading_args: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: many_leading_args: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movaps (%rax), %xmm0 +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: many_leading_args: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: many_leading_args: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: movl %edi, 
12(%eax) +; CHECK-X86-NEXT: movl %esi, 8(%eax) +; CHECK-X86-NEXT: movl %edx, 4(%eax) +; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl $4 +; +; CHECK-MSVC32-LABEL: many_leading_args: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) +; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) +; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) +; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: retl + ret PrimTy %x +} + +define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy %x, i64 %_5) nounwind { +; CHECK-X64-F128-LABEL: trailing_arg: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: trailing_arg: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: trailing_arg: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movaps (%rax), %xmm0 +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: trailing_arg: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: trailing_arg: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movaps (%rax), %xmm0 +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: trailing_arg: +; CHECK-MINGW-I128: # %bb.0: +; 
CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: trailing_arg: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: movl %edi, 12(%eax) +; CHECK-X86-NEXT: movl %esi, 8(%eax) +; CHECK-X86-NEXT: movl %edx, 4(%eax) +; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl $4 +; +; CHECK-MSVC32-LABEL: trailing_arg: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) +; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) +; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) +; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: retl + ret PrimTy %x +} + +define void @call_first_arg(PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: call_first_arg: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: pushq %rax +; CHECK-X64-F128-NEXT: callq first_arg@PLT +; CHECK-X64-F128-NEXT: popq %rax +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: call_first_arg: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: pushq %rax +; CHECK-X64-I128-NEXT: callq first_arg@PLT +; CHECK-X64-I128-NEXT: popq %rax +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: call_first_arg: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: subq $56, %rsp +; 
CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; CHECK-MSVC64-F128-NEXT: callq first_arg +; CHECK-MSVC64-F128-NEXT: addq $56, %rsp +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: call_first_arg: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: subq $40, %rsp +; CHECK-MSVC64-I128-NEXT: callq first_arg +; CHECK-MSVC64-I128-NEXT: addq $40, %rsp +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: call_first_arg: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: subq $56, %rsp +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; CHECK-MINGW-F128-NEXT: callq first_arg +; CHECK-MINGW-F128-NEXT: addq $56, %rsp +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: call_first_arg: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: subq $40, %rsp +; CHECK-MINGW-I128-NEXT: callq first_arg +; CHECK-MINGW-I128-NEXT: addq $40, %rsp +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: call_first_arg: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: subl $40, %esp +; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: calll first_arg@PLT +; CHECK-X86-NEXT: addl $56, %esp +; CHECK-X86-NEXT: retl +; +; CHECK-MSVC32-LABEL: call_first_arg: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: subl $32, %esp +; CHECK-MSVC32-NEXT: movl %esp, %eax +; CHECK-MSVC32-NEXT: pushl 20(%ebp) +; CHECK-MSVC32-NEXT: pushl 16(%ebp) +; CHECK-MSVC32-NEXT: pushl 12(%ebp) +; CHECK-MSVC32-NEXT: pushl 8(%ebp) +; CHECK-MSVC32-NEXT: pushl 
%eax +; CHECK-MSVC32-NEXT: calll _first_arg +; CHECK-MSVC32-NEXT: addl $20, %esp +; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: popl %ebp +; CHECK-MSVC32-NEXT: retl + call PrimTy @first_arg(PrimTy %x) + ret void +} + +define void @call_leading_args(PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: call_leading_args: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: pushq %rax +; CHECK-X64-F128-NEXT: xorl %edi, %edi +; CHECK-X64-F128-NEXT: xorl %esi, %esi +; CHECK-X64-F128-NEXT: xorl %edx, %edx +; CHECK-X64-F128-NEXT: xorl %ecx, %ecx +; CHECK-X64-F128-NEXT: callq leading_args@PLT +; CHECK-X64-F128-NEXT: popq %rax +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: call_leading_args: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: pushq %rax +; CHECK-X64-I128-NEXT: movq %rsi, %r9 +; CHECK-X64-I128-NEXT: movq %rdi, %r8 +; CHECK-X64-I128-NEXT: xorl %edi, %edi +; CHECK-X64-I128-NEXT: xorl %esi, %esi +; CHECK-X64-I128-NEXT: xorl %edx, %edx +; CHECK-X64-I128-NEXT: xorl %ecx, %ecx +; CHECK-X64-I128-NEXT: callq leading_args@PLT +; CHECK-X64-I128-NEXT: popq %rax +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: call_leading_args: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: subq $72, %rsp +; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: xorl %ecx, %ecx +; CHECK-MSVC64-F128-NEXT: xorl %edx, %edx +; CHECK-MSVC64-F128-NEXT: xorl %r8d, %r8d +; CHECK-MSVC64-F128-NEXT: xorl %r9d, %r9d +; CHECK-MSVC64-F128-NEXT: callq leading_args +; CHECK-MSVC64-F128-NEXT: addq $72, %rsp +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: call_leading_args: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: subq $56, %rsp +; CHECK-MSVC64-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; 
CHECK-MSVC64-I128-NEXT: xorl %ecx, %ecx +; CHECK-MSVC64-I128-NEXT: xorl %edx, %edx +; CHECK-MSVC64-I128-NEXT: xorl %r8d, %r8d +; CHECK-MSVC64-I128-NEXT: xorl %r9d, %r9d +; CHECK-MSVC64-I128-NEXT: callq leading_args +; CHECK-MSVC64-I128-NEXT: addq $56, %rsp +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: call_leading_args: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: subq $72, %rsp +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: xorl %ecx, %ecx +; CHECK-MINGW-F128-NEXT: xorl %edx, %edx +; CHECK-MINGW-F128-NEXT: xorl %r8d, %r8d +; CHECK-MINGW-F128-NEXT: xorl %r9d, %r9d +; CHECK-MINGW-F128-NEXT: callq leading_args +; CHECK-MINGW-F128-NEXT: addq $72, %rsp +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: call_leading_args: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: subq $56, %rsp +; CHECK-MINGW-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: xorl %ecx, %ecx +; CHECK-MINGW-I128-NEXT: xorl %edx, %edx +; CHECK-MINGW-I128-NEXT: xorl %r8d, %r8d +; CHECK-MINGW-I128-NEXT: xorl %r9d, %r9d +; CHECK-MINGW-I128-NEXT: callq leading_args +; CHECK-MINGW-I128-NEXT: addq $56, %rsp +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: call_leading_args: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: subl $40, %esp +; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl %eax +; 
CHECK-X86-NEXT: calll leading_args@PLT +; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: retl +; +; CHECK-MSVC32-LABEL: call_leading_args: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: subl $32, %esp +; CHECK-MSVC32-NEXT: movl %esp, %eax +; CHECK-MSVC32-NEXT: pushl 20(%ebp) +; CHECK-MSVC32-NEXT: pushl 16(%ebp) +; CHECK-MSVC32-NEXT: pushl 12(%ebp) +; CHECK-MSVC32-NEXT: pushl 8(%ebp) +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: calll _leading_args +; CHECK-MSVC32-NEXT: addl $52, %esp +; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: popl %ebp +; CHECK-MSVC32-NEXT: retl + call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x) + ret void +} + +define void @call_many_leading_args(PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: call_many_leading_args: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: pushq %rax +; CHECK-X64-F128-NEXT: movaps %xmm0, %xmm1 +; CHECK-X64-F128-NEXT: xorps %xmm0, %xmm0 +; CHECK-X64-F128-NEXT: xorl %edi, %edi +; CHECK-X64-F128-NEXT: xorl %esi, %esi +; CHECK-X64-F128-NEXT: xorl %edx, %edx +; CHECK-X64-F128-NEXT: xorl %ecx, %ecx +; CHECK-X64-F128-NEXT: callq many_leading_args@PLT +; CHECK-X64-F128-NEXT: popq %rax +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: call_many_leading_args: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: pushq %rax +; CHECK-X64-I128-NEXT: movq %rsi, %rax +; CHECK-X64-I128-NEXT: movq %rdi, %r10 +; CHECK-X64-I128-NEXT: xorl %edi, %edi +; CHECK-X64-I128-NEXT: xorl %esi, %esi +; CHECK-X64-I128-NEXT: xorl %edx, %edx +; CHECK-X64-I128-NEXT: xorl %ecx, %ecx +; CHECK-X64-I128-NEXT: xorl %r8d, %r8d +; CHECK-X64-I128-NEXT: xorl 
%r9d, %r9d +; CHECK-X64-I128-NEXT: pushq %rax +; CHECK-X64-I128-NEXT: pushq %r10 +; CHECK-X64-I128-NEXT: callq many_leading_args@PLT +; CHECK-X64-I128-NEXT: addq $16, %rsp +; CHECK-X64-I128-NEXT: popq %rax +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: call_many_leading_args: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: subq $88, %rsp +; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: xorps %xmm1, %xmm1 +; CHECK-MSVC64-F128-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: xorl %ecx, %ecx +; CHECK-MSVC64-F128-NEXT: xorl %edx, %edx +; CHECK-MSVC64-F128-NEXT: xorl %r8d, %r8d +; CHECK-MSVC64-F128-NEXT: xorl %r9d, %r9d +; CHECK-MSVC64-F128-NEXT: callq many_leading_args +; CHECK-MSVC64-F128-NEXT: addq $88, %rsp +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: call_many_leading_args: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: subq $72, %rsp +; CHECK-MSVC64-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: xorl %ecx, %ecx +; CHECK-MSVC64-I128-NEXT: xorl %edx, %edx +; CHECK-MSVC64-I128-NEXT: xorl %r8d, %r8d +; CHECK-MSVC64-I128-NEXT: xorl %r9d, %r9d +; CHECK-MSVC64-I128-NEXT: callq many_leading_args +; CHECK-MSVC64-I128-NEXT: addq $72, %rsp +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: call_many_leading_args: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: subq $88, %rsp +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: xorps %xmm1, %xmm1 +; CHECK-MINGW-F128-NEXT: movaps %xmm1, 
{{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: xorl %ecx, %ecx +; CHECK-MINGW-F128-NEXT: xorl %edx, %edx +; CHECK-MINGW-F128-NEXT: xorl %r8d, %r8d +; CHECK-MINGW-F128-NEXT: xorl %r9d, %r9d +; CHECK-MINGW-F128-NEXT: callq many_leading_args +; CHECK-MINGW-F128-NEXT: addq $88, %rsp +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: call_many_leading_args: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: subq $72, %rsp +; CHECK-MINGW-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: xorl %ecx, %ecx +; CHECK-MINGW-I128-NEXT: xorl %edx, %edx +; CHECK-MINGW-I128-NEXT: xorl %r8d, %r8d +; CHECK-MINGW-I128-NEXT: xorl %r9d, %r9d +; CHECK-MINGW-I128-NEXT: callq many_leading_args +; CHECK-MINGW-I128-NEXT: addq $72, %rsp +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: call_many_leading_args: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: subl $40, %esp +; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: calll many_leading_args@PLT +; CHECK-X86-NEXT: addl $104, %esp +; CHECK-X86-NEXT: retl 
+; +; CHECK-MSVC32-LABEL: call_many_leading_args: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: subl $32, %esp +; CHECK-MSVC32-NEXT: movl %esp, %eax +; CHECK-MSVC32-NEXT: pushl 20(%ebp) +; CHECK-MSVC32-NEXT: pushl 16(%ebp) +; CHECK-MSVC32-NEXT: pushl 12(%ebp) +; CHECK-MSVC32-NEXT: pushl 8(%ebp) +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: calll _many_leading_args +; CHECK-MSVC32-NEXT: addl $68, %esp +; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: popl %ebp +; CHECK-MSVC32-NEXT: retl + call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x) + ret void +} + +define void @call_trailing_arg(PrimTy %x) nounwind { +; CHECK-X64-F128-LABEL: call_trailing_arg: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: pushq %rax +; CHECK-X64-F128-NEXT: xorl %edi, %edi +; CHECK-X64-F128-NEXT: xorl %esi, %esi +; CHECK-X64-F128-NEXT: xorl %edx, %edx +; CHECK-X64-F128-NEXT: xorl %ecx, %ecx +; CHECK-X64-F128-NEXT: callq trailing_arg@PLT +; CHECK-X64-F128-NEXT: popq %rax +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: call_trailing_arg: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: pushq %rax +; CHECK-X64-I128-NEXT: movq %rsi, %r9 +; CHECK-X64-I128-NEXT: movq %rdi, %r8 +; CHECK-X64-I128-NEXT: xorl %edi, %edi +; CHECK-X64-I128-NEXT: xorl %esi, %esi +; CHECK-X64-I128-NEXT: xorl %edx, %edx +; CHECK-X64-I128-NEXT: xorl %ecx, %ecx +; CHECK-X64-I128-NEXT: callq trailing_arg@PLT +; CHECK-X64-I128-NEXT: popq %rax +; CHECK-X64-I128-NEXT: retq +; +; 
CHECK-MSVC64-F128-LABEL: call_trailing_arg: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: subq $72, %rsp +; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: xorl %ecx, %ecx +; CHECK-MSVC64-F128-NEXT: xorl %edx, %edx +; CHECK-MSVC64-F128-NEXT: xorl %r8d, %r8d +; CHECK-MSVC64-F128-NEXT: xorl %r9d, %r9d +; CHECK-MSVC64-F128-NEXT: callq trailing_arg +; CHECK-MSVC64-F128-NEXT: addq $72, %rsp +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: call_trailing_arg: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: subq $56, %rsp +; CHECK-MSVC64-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: xorl %ecx, %ecx +; CHECK-MSVC64-I128-NEXT: xorl %edx, %edx +; CHECK-MSVC64-I128-NEXT: xorl %r8d, %r8d +; CHECK-MSVC64-I128-NEXT: xorl %r9d, %r9d +; CHECK-MSVC64-I128-NEXT: callq trailing_arg +; CHECK-MSVC64-I128-NEXT: addq $56, %rsp +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: call_trailing_arg: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: subq $72, %rsp +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: xorl %ecx, %ecx +; CHECK-MINGW-F128-NEXT: xorl %edx, %edx +; CHECK-MINGW-F128-NEXT: xorl %r8d, %r8d +; CHECK-MINGW-F128-NEXT: xorl %r9d, %r9d +; CHECK-MINGW-F128-NEXT: callq trailing_arg +; CHECK-MINGW-F128-NEXT: addq $72, %rsp +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: call_trailing_arg: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: subq $56, %rsp +; CHECK-MINGW-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; 
CHECK-MINGW-I128-NEXT: xorl %ecx, %ecx +; CHECK-MINGW-I128-NEXT: xorl %edx, %edx +; CHECK-MINGW-I128-NEXT: xorl %r8d, %r8d +; CHECK-MINGW-I128-NEXT: xorl %r9d, %r9d +; CHECK-MINGW-I128-NEXT: callq trailing_arg +; CHECK-MINGW-I128-NEXT: addq $56, %rsp +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: call_trailing_arg: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: subl $40, %esp +; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl $0 +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: calll trailing_arg@PLT +; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: retl +; +; CHECK-MSVC32-LABEL: call_trailing_arg: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: subl $32, %esp +; CHECK-MSVC32-NEXT: movl %esp, %eax +; CHECK-MSVC32-NEXT: pushl 20(%ebp) +; CHECK-MSVC32-NEXT: pushl 16(%ebp) +; CHECK-MSVC32-NEXT: pushl 12(%ebp) +; CHECK-MSVC32-NEXT: pushl 8(%ebp) +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl $0 +; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: calll _trailing_arg +; CHECK-MSVC32-NEXT: addl $52, %esp +; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: popl %ebp +; CHECK-MSVC32-NEXT: retl + call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x) + ret void +} diff --git a/llvm/test/CodeGen/X86/inline-asm-fpstack.ll b/llvm/test/CodeGen/X86/inline-asm-fpstack.ll index 
2d8ad6d645bc0..af188ef3a2cf8 100644 --- a/llvm/test/CodeGen/X86/inline-asm-fpstack.ll +++ b/llvm/test/CodeGen/X86/inline-asm-fpstack.ll @@ -29,12 +29,10 @@ define double @test2() nounwind { define void @test3(x86_fp80 %X) nounwind { ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: -; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: frob ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( x86_fp80 %X) ret void @@ -248,14 +246,12 @@ entry: define void @fist1(x86_fp80 %x, ptr %p) nounwind ssp { ; CHECK-LABEL: fist1: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: fistl (%eax) ; CHECK-NEXT: ## InlineAsm End ; CHECK-NEXT: fstp %st(0) -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl entry: tail call void asm sideeffect "fistl $1", "{st},*m,~{memory},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, ptr elementtype(i32) %p) nounwind @@ -273,13 +269,11 @@ entry: define x86_fp80 @fist2(x86_fp80 %x, ptr %p) nounwind ssp { ; CHECK-LABEL: fist2: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: fistl (%eax) ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl entry: %0 = tail call x86_fp80 asm "fistl $2", "=&{st},0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, ptr elementtype(i32) %p) nounwind @@ -294,7 +288,6 @@ entry: define void @fucomp1(x86_fp80 %x, x86_fp80 %y) nounwind ssp { ; CHECK-LABEL: fucomp1: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: fxch %st(1) @@ -302,7 +295,6 @@ define void @fucomp1(x86_fp80 %x, x86_fp80 %y) 
nounwind ssp { ; CHECK-NEXT: fucomp %st(1) ; CHECK-NEXT: ## InlineAsm End ; CHECK-NEXT: fstp %st(0) -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl entry: tail call void asm sideeffect "fucomp $1", "{st},f,~{st},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind @@ -322,7 +314,6 @@ entry: define void @fucomp2(x86_fp80 %x, x86_fp80 %y) nounwind ssp { ; CHECK-LABEL: fucomp2: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: fxch %st(1) @@ -330,7 +321,6 @@ define void @fucomp2(x86_fp80 %x, x86_fp80 %y) nounwind ssp { ; CHECK-NEXT: fucomp %st(1) ; CHECK-NEXT: ## InlineAsm End ; CHECK-NEXT: fstp %st(0) -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl entry: tail call void asm sideeffect "fucomp $1", "{st},{st(1)},~{st},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind @@ -340,14 +330,12 @@ entry: define void @fucomp3(x86_fp80 %x, x86_fp80 %y) nounwind ssp { ; CHECK-LABEL: fucomp3: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ; CHECK-NEXT: fxch %st(1) ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: fucompp %st(1) ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl entry: tail call void asm sideeffect "fucompp $1", "{st},{st(1)},~{st},~{st(1)},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind diff --git a/llvm/test/CodeGen/X86/isel-fcmp-x87.ll b/llvm/test/CodeGen/X86/isel-fcmp-x87.ll index 8c2a53082649a..84c9750bc326d 100644 --- a/llvm/test/CodeGen/X86/isel-fcmp-x87.ll +++ b/llvm/test/CodeGen/X86/isel-fcmp-x87.ll @@ -33,7 +33,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_oeq: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -43,12 +42,10 @@ ; X86-NEXT: setnp %cl ; X86-NEXT: sete %al ; X86-NEXT: andb %cl, %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl 
; ; GISEL-X86-LABEL: fcmp_x86_fp80_oeq: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) @@ -57,7 +54,6 @@ ; GISEL-X86-NEXT: sete %cl ; GISEL-X86-NEXT: setnp %al ; GISEL-X86-NEXT: andb %cl, %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp oeq x86_fp80 %x, %y ret i1 %1 @@ -85,7 +81,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_ogt: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -93,19 +88,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: seta %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ogt: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: seta %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ogt x86_fp80 %x, %y ret i1 %1 @@ -133,7 +125,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_oge: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -141,19 +132,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: setae %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_oge: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setae %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp oge x86_fp80 %x, %y ret i1 %1 @@ -190,7 +178,6 @@ ; ; SDAG-X86-LABEL: fcmp_x86_fp80_olt: ; SDAG-X86: ## %bb.0: -; SDAG-X86-NEXT: subl $12, 
%esp ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fucompp @@ -198,12 +185,10 @@ ; SDAG-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; SDAG-X86-NEXT: sahf ; SDAG-X86-NEXT: seta %al -; SDAG-X86-NEXT: addl $12, %esp ; SDAG-X86-NEXT: retl ; ; FAST-X86-LABEL: fcmp_x86_fp80_olt: ; FAST-X86: ## %bb.0: -; FAST-X86-NEXT: subl $12, %esp ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fxch %st(1) @@ -212,18 +197,15 @@ ; FAST-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; FAST-X86-NEXT: sahf ; FAST-X86-NEXT: seta %al -; FAST-X86-NEXT: addl $12, %esp ; FAST-X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_olt: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: seta %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp olt x86_fp80 %x, %y ret i1 %1 @@ -260,7 +242,6 @@ ; ; SDAG-X86-LABEL: fcmp_x86_fp80_ole: ; SDAG-X86: ## %bb.0: -; SDAG-X86-NEXT: subl $12, %esp ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fucompp @@ -268,12 +249,10 @@ ; SDAG-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; SDAG-X86-NEXT: sahf ; SDAG-X86-NEXT: setae %al -; SDAG-X86-NEXT: addl $12, %esp ; SDAG-X86-NEXT: retl ; ; FAST-X86-LABEL: fcmp_x86_fp80_ole: ; FAST-X86: ## %bb.0: -; FAST-X86-NEXT: subl $12, %esp ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fxch %st(1) @@ -282,18 +261,15 @@ ; FAST-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; FAST-X86-NEXT: sahf ; FAST-X86-NEXT: setae %al -; FAST-X86-NEXT: addl $12, %esp ; FAST-X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ole: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) 
; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setae %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ole x86_fp80 %x, %y ret i1 %1 @@ -321,7 +297,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_one: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -329,19 +304,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: setne %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_one: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setne %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp one x86_fp80 %x, %y ret i1 %1 @@ -369,7 +341,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_ord: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -377,19 +348,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: setnp %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ord: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setnp %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ord x86_fp80 %x, %y ret i1 %1 @@ -417,7 +385,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_uno: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -425,19 +392,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: setp %al -; X86-NEXT: addl 
$12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_uno: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setp %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp uno x86_fp80 %x, %y ret i1 %1 @@ -465,7 +429,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_ueq: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -473,19 +436,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: sete %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ueq: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: sete %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ueq x86_fp80 %x, %y ret i1 %1 @@ -522,7 +482,6 @@ ; ; SDAG-X86-LABEL: fcmp_x86_fp80_ugt: ; SDAG-X86: ## %bb.0: -; SDAG-X86-NEXT: subl $12, %esp ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fucompp @@ -530,12 +489,10 @@ ; SDAG-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; SDAG-X86-NEXT: sahf ; SDAG-X86-NEXT: setb %al -; SDAG-X86-NEXT: addl $12, %esp ; SDAG-X86-NEXT: retl ; ; FAST-X86-LABEL: fcmp_x86_fp80_ugt: ; FAST-X86: ## %bb.0: -; FAST-X86-NEXT: subl $12, %esp ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fxch %st(1) @@ -544,18 +501,15 @@ ; FAST-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; FAST-X86-NEXT: sahf ; FAST-X86-NEXT: setb %al -; FAST-X86-NEXT: addl $12, %esp ; FAST-X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ugt: ; 
GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setb %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ugt x86_fp80 %x, %y ret i1 %1 @@ -592,7 +546,6 @@ ; ; SDAG-X86-LABEL: fcmp_x86_fp80_uge: ; SDAG-X86: ## %bb.0: -; SDAG-X86-NEXT: subl $12, %esp ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) ; SDAG-X86-NEXT: fucompp @@ -600,12 +553,10 @@ ; SDAG-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; SDAG-X86-NEXT: sahf ; SDAG-X86-NEXT: setbe %al -; SDAG-X86-NEXT: addl $12, %esp ; SDAG-X86-NEXT: retl ; ; FAST-X86-LABEL: fcmp_x86_fp80_uge: ; FAST-X86: ## %bb.0: -; FAST-X86-NEXT: subl $12, %esp ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) ; FAST-X86-NEXT: fxch %st(1) @@ -614,18 +565,15 @@ ; FAST-X86-NEXT: ## kill: def $ah killed $ah killed $ax ; FAST-X86-NEXT: sahf ; FAST-X86-NEXT: setbe %al -; FAST-X86-NEXT: addl $12, %esp ; FAST-X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_uge: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setbe %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp uge x86_fp80 %x, %y ret i1 %1 @@ -653,7 +601,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_ult: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -661,19 +608,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: setb %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ult: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) 
; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setb %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ult x86_fp80 %x, %y ret i1 %1 @@ -701,7 +645,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_ule: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -709,19 +652,16 @@ ; X86-NEXT: ## kill: def $ah killed $ah killed $ax ; X86-NEXT: sahf ; X86-NEXT: setbe %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_ule: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) ; GISEL-X86-NEXT: fucompi %st(1), %st ; GISEL-X86-NEXT: fstp %st(0) ; GISEL-X86-NEXT: setbe %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp ule x86_fp80 %x, %y ret i1 %1 @@ -753,7 +693,6 @@ ; ; X86-LABEL: fcmp_x86_fp80_une: ; X86: ## %bb.0: -; X86-NEXT: subl $12, %esp ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fucompp @@ -763,12 +702,10 @@ ; X86-NEXT: setp %cl ; X86-NEXT: setne %al ; X86-NEXT: orb %cl, %al -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; GISEL-X86-LABEL: fcmp_x86_fp80_une: ; GISEL-X86: ## %bb.0: -; GISEL-X86-NEXT: subl $12, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fxch %st(1) @@ -777,7 +714,6 @@ ; GISEL-X86-NEXT: setne %cl ; GISEL-X86-NEXT: setp %al ; GISEL-X86-NEXT: orb %cl, %al -; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl %1 = fcmp une x86_fp80 %x, %y ret i1 %1 diff --git a/llvm/test/CodeGen/X86/kcfi-arity.ll b/llvm/test/CodeGen/X86/kcfi-arity.ll index 009fa7d2dc0a4..5a19bcd7835ea 100644 --- a/llvm/test/CodeGen/X86/kcfi-arity.ll +++ b/llvm/test/CodeGen/X86/kcfi-arity.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s 
| FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64-unknown-none -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI diff --git a/llvm/test/CodeGen/X86/long-double-abi-align.ll b/llvm/test/CodeGen/X86/long-double-abi-align.ll index fcb88714f8b82..02d68ada9a8d4 100644 --- a/llvm/test/CodeGen/X86/long-double-abi-align.ll +++ b/llvm/test/CodeGen/X86/long-double-abi-align.ll @@ -73,7 +73,7 @@ define void @foo(i32 %0, x86_fp80 %1, i32 %2) nounwind { ; DARWIN-LABEL: foo: ; DARWIN: ## %bb.0: ; DARWIN-NEXT: subl $44, %esp -; DARWIN-NEXT: fldt 64(%esp) +; DARWIN-NEXT: fldt 52(%esp) ; DARWIN-NEXT: fstpt 16(%esp) ; DARWIN-NEXT: leal 48(%esp), %eax ; DARWIN-NEXT: movl %eax, (%esp) @@ -81,7 +81,7 @@ define void @foo(i32 %0, x86_fp80 %1, i32 %2) nounwind { ; DARWIN-NEXT: leal 16(%esp), %eax ; DARWIN-NEXT: movl %eax, (%esp) ; DARWIN-NEXT: calll _escape -; DARWIN-NEXT: leal 80(%esp), %eax +; DARWIN-NEXT: leal 68(%esp), %eax ; DARWIN-NEXT: movl %eax, (%esp) ; DARWIN-NEXT: calll _escape ; DARWIN-NEXT: addl $44, %esp diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll index 0caa569107c0c..4613c2bcdcaf4 100644 --- a/llvm/test/CodeGen/X86/pr78897.ll +++ b/llvm/test/CodeGen/X86/pr78897.ll @@ -225,9 +225,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X86-AVX512-NEXT: pushl %esi ; X86-AVX512-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 ; X86-AVX512-NEXT: vmovd %xmm0, %eax -; X86-AVX512-NEXT: kmovd %eax, %k0 -; X86-AVX512-NEXT: knotw %k0, %k1 -; X86-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X86-AVX512-NEXT: kmovd %eax, %k1 +; X86-AVX512-NEXT: knotw %k1, %k2 +; X86-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 
{%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X86-AVX512-NEXT: vpextrd $1, %xmm0, %eax ; X86-AVX512-NEXT: vmovd %xmm0, %edx ; X86-AVX512-NEXT: movl $286331152, %ecx # imm = 0x11111110 @@ -247,9 +247,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X86-AVX512-NEXT: addl %edx, %eax ; X86-AVX512-NEXT: vmovd %esi, %xmm1 ; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} -; X86-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0 -; X86-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; X86-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X86-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-AVX512-NEXT: popl %esi ; X86-AVX512-NEXT: popl %edi @@ -258,9 +258,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; ; X64-AVX512-LABEL: produceShuffleVectorForByte: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: kmovd %edi, %k0 -; X64-AVX512-NEXT: knotw %k0, %k1 -; X64-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X64-AVX512-NEXT: kmovd %edi, %k1 +; X64-AVX512-NEXT: knotw %k1, %k2 +; X64-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 ; X64-AVX512-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 @@ -269,9 +269,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-AVX512-NEXT: vmovq %rax, %xmm0 ; X64-AVX512-NEXT: imulq %rcx, %rdx ; X64-AVX512-NEXT: vmovq %rdx, %xmm1 -; X64-AVX512-NEXT: 
vmovdqu8 %xmm0, %xmm1 {%k1} -; X64-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0 -; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; X64-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: retq entry: diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index 0421d525890e7..b292a8a9b1d66 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -11436,8 +11436,11 @@ define <16 x i32>@test_int_x86_avx512_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1) ; CHECK-LABEL: @test_int_x86_avx512_pabs_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i32> [[X0:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> splat (i32 -1), <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 false, <16 x i32> [[TMP3]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0]], i1 false) +; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP2]] ; %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> 
%x1, i16 -1) @@ -11451,12 +11454,15 @@ define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <16 x i32> [[X0:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> splat (i32 -1), <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 false, <16 x i32> [[TMP13]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0]], i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP14]], <16 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] @@ -11473,8 +11479,11 @@ define <8 x i64>@test_int_x86_avx512_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 ; CHECK-LABEL: @test_int_x86_avx512_pabs_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x 
i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <8 x i64> [[X0:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> splat (i64 -1), <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 false, <8 x i64> [[TMP3]], <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0]], i1 false) +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i64> [[TMP2]] ; %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1) @@ -11488,12 +11497,15 @@ define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <8 x i64> [[X0:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> splat (i64 -1), <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 false, <8 x i64> [[TMP13]], <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0]], i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP14]], <8 x i64> [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 
x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll index a41f26a0e3c1c..18441b2d7e253 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll @@ -4443,8 +4443,11 @@ define <32 x i16> @test_int_x86_avx512_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1 ; CHECK-LABEL: @test_int_x86_avx512_pabs_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <32 x i16> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <32 x i16> [[X0:%.*]], splat (i16 -32768) +; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP5]], <32 x i16> splat (i16 -1), <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 false, <32 x i16> [[TMP3]], <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> [[X0]], i1 false) +; CHECK-NEXT: store <32 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i16> [[TMP2]] ; %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1) @@ -4457,12 +4460,15 @@ define <32 x i16> @test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: 
[[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <32 x i16> [[X0:%.*]], splat (i16 -32768) +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> splat (i16 -1), <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 false, <32 x i16> [[TMP13]], <32 x i16> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> [[X0]], i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[X2:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x i16> [[TMP1]], <32 x i16> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x i16> [[TMP14]], <32 x i16> [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <32 x i16> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i16> [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i16> [[TMP10]], <32 x i16> [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP6]], <32 x i16> [[TMP4]], <32 x i16> [[X1]] @@ -4479,8 +4485,11 @@ define <64 x i8> @test_int_x86_avx512_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1) n ; CHECK-LABEL: @test_int_x86_avx512_pabs_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <64 x i8> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[X0:%.*]], splat (i8 -128) +; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> 
splat (i8 -1), <64 x i8> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 false, <64 x i8> [[TMP3]], <64 x i8> [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> [[X0]], i1 false) +; CHECK-NEXT: store <64 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP2]] ; %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1) @@ -4493,12 +4502,15 @@ define <64 x i8> @test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> % ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <64 x i8> [[X0:%.*]], splat (i8 -128) +; CHECK-NEXT: [[TMP13:%.*]] = select <64 x i1> [[TMP12]], <64 x i8> splat (i8 -1), <64 x i8> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 false, <64 x i8> [[TMP13]], <64 x i8> [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> [[X0]], i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP2]] to <64 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X2:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP1]], <64 x i8> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP14]], <64 x i8> [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = xor <64 x i8> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP10]], <64 x i8> [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = select 
<64 x i1> [[TMP6]], <64 x i8> [[TMP4]], <64 x i8> [[X1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/abs-vector.ll b/llvm/test/Instrumentation/MemorySanitizer/abs-vector.ll index fd16dfe54e675..11af676eed7b4 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/abs-vector.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/abs-vector.ll @@ -6,19 +6,22 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 target triple = "x86_64-unknown-linux-gnu" define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a) local_unnamed_addr #0 { -; CHECK-LABEL: @test_mm256_abs_epi8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP0]] to <32 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP3]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i8> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i8> [[TMP4]] to <4 x i64> -; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 -; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret <4 x i64> [[TMP6]] +; ORIGIN-LABEL: @test_mm256_abs_epi8( +; ORIGIN-NEXT: entry: +; ORIGIN-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; ORIGIN-NEXT: call void @llvm.donothing() +; ORIGIN-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP0]] to <32 x i8> +; ORIGIN-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8> +; ORIGIN-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[TMP3]], splat (i8 -128) +; ORIGIN-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 -1), <32 x i8> [[TMP2]] +; ORIGIN-NEXT: [[TMP6:%.*]] = 
select i1 false, <32 x i8> [[TMP5]], <32 x i8> [[TMP2]] +; ORIGIN-NEXT: [[TMP7:%.*]] = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP3]], i1 false) +; ORIGIN-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP6]] to <4 x i64> +; ORIGIN-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP7]] to <4 x i64> +; ORIGIN-NEXT: store <4 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 +; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 +; ORIGIN-NEXT: ret <4 x i64> [[TMP9]] ; entry: %0 = bitcast <4 x i64> %a to <32 x i8> @@ -28,19 +31,22 @@ entry: } define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a) local_unnamed_addr #0 { -; CHECK-LABEL: @test_mm256_abs_epi16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP0]] to <16 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP3]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP4]] to <4 x i64> -; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 -; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret <4 x i64> [[TMP6]] +; ORIGIN-LABEL: @test_mm256_abs_epi16( +; ORIGIN-NEXT: entry: +; ORIGIN-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; ORIGIN-NEXT: call void @llvm.donothing() +; ORIGIN-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP0]] to <16 x i16> +; ORIGIN-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <16 x i16> +; ORIGIN-NEXT: [[TMP4:%.*]] = icmp eq <16 x i16> [[TMP3]], splat (i16 -32768) +; ORIGIN-NEXT: [[TMP5:%.*]] = select <16 x i1> 
[[TMP4]], <16 x i16> splat (i16 -1), <16 x i16> [[TMP2]] +; ORIGIN-NEXT: [[TMP6:%.*]] = select i1 false, <16 x i16> [[TMP5]], <16 x i16> [[TMP2]] +; ORIGIN-NEXT: [[TMP7:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP3]], i1 false) +; ORIGIN-NEXT: [[TMP8:%.*]] = bitcast <16 x i16> [[TMP6]] to <4 x i64> +; ORIGIN-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP7]] to <4 x i64> +; ORIGIN-NEXT: store <4 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 +; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 +; ORIGIN-NEXT: ret <4 x i64> [[TMP9]] ; entry: %0 = bitcast <4 x i64> %a to <16 x i16> @@ -50,19 +56,22 @@ entry: } define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a) local_unnamed_addr #0 { -; CHECK-LABEL: @test_mm256_abs_epi32( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP3]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP4]] to <4 x i64> -; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 -; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret <4 x i64> [[TMP6]] +; ORIGIN-LABEL: @test_mm256_abs_epi32( +; ORIGIN-NEXT: entry: +; ORIGIN-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; ORIGIN-NEXT: call void @llvm.donothing() +; ORIGIN-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +; ORIGIN-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32> +; ORIGIN-NEXT: [[TMP4:%.*]] = icmp eq <8 x 
i32> [[TMP3]], splat (i32 -2147483648) +; ORIGIN-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> splat (i32 -1), <8 x i32> [[TMP2]] +; ORIGIN-NEXT: [[TMP6:%.*]] = select i1 false, <8 x i32> [[TMP5]], <8 x i32> [[TMP2]] +; ORIGIN-NEXT: [[TMP7:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP3]], i1 false) +; ORIGIN-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP6]] to <4 x i64> +; ORIGIN-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP7]] to <4 x i64> +; ORIGIN-NEXT: store <4 x i64> [[TMP8]], ptr @__msan_retval_tls, align 8 +; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 +; ORIGIN-NEXT: ret <4 x i64> [[TMP9]] ; entry: %0 = bitcast <4 x i64> %a to <8 x i32> diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s index 95a9268112920..56c8d7ec07496 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s @@ -28,3 +28,15 @@ s_rfe_i64 s[2:3] s_rfe_b64 s[2:3] // GFX1250: s_rfe_i64 s[2:3] ; encoding: [0x02,0x4a,0x80,0xbe] + +s_barrier_signal -3 +// GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe] + +s_get_barrier_state s3, -3 +// GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe] + +s_get_barrier_state s3, -4 +// GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe] + +s_get_barrier_state s3, m0 +// GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s index 6ebc17468eed6..234c2ed0de793 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s @@ -1,6 +1,26 @@ // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: -strict-whitespace %s +s_wait_asynccnt 0x1234 +// GFX1250: 
[0x34,0x12,0xca,0xbf] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_wait_asynccnt 0xc1d1 +// GFX1250: [0xd1,0xc1,0xca,0xbf] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_wait_tensorcnt 0x0 +// GFX1250: [0x00,0x00,0xcb,0xbf] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_wait_tensorcnt 0x1 +// GFX1250: [0x01,0x00,0xcb,0xbf] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_wait_tensorcnt 0x3 +// GFX1250: [0x03,0x00,0xcb,0xbf] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + s_wait_xcnt 0x0 // GFX1250: [0x00,0x00,0xc5,0xbf] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s new file mode 100644 index 0000000000000..cc14e4caf851e --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s @@ -0,0 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX1200-ERR --implicit-check-not=error: %s + +v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], v[254:255], 0x405ec00000000000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], v[254:255], 0x405ec000, v[2:3] ; encoding: [0xfe,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], s[2:3], 0x405ec00012345678, v[2:3] +// GFX1250: 
v_fmamk_f64 v[6:7], s[2:3], lit64(0x405ec00012345678), v[2:3] ; encoding: [0x02,0x04,0x0c,0x46,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], vcc, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], vcc, 0x405ec000, v[2:3] ; encoding: [0x6a,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], exec, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], exec, 0x405ec000, v[2:3] ; encoding: [0x7e,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], null, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], null, 0x405ec000, v[2:3] ; encoding: [0x7c,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], -1, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], -1, 0x405ec000, v[2:3] ; encoding: [0xc1,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], 0.5, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], 0.5, 0x405ec000, v[2:3] ; encoding: [0xf0,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[6:7], src_scc, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[6:7], src_scc, 0x405ec000, v[2:3] ; encoding: [0xfd,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[254:255], 0x405ec000, 0x405ec000, v[254:255] +// GFX1250: v_fmamk_f64 v[254:255], 0x405ec000, 0x405ec000, v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x47,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[254:255], 0x405ec00012345678, 0x405ec00012345678, v[254:255] +// GFX1250: v_fmamk_f64 v[254:255], lit64(0x405ec00012345678), lit64(0x405ec00012345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x47,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[254:255], 123.0, 0x405ec000, v[2:3] +// GFX1250: v_fmamk_f64 v[254:255], 0x405ec000, 0x405ec000, v[2:3] ; encoding: [0xfe,0x04,0xfc,0x47,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], v[2:3], 123.1, v[6:7] +// GFX1250: v_fmamk_f64 v[4:5], v[2:3], lit64(0x405ec66666666666), v[6:7] ; encoding: [0x02,0x0d,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], 0x405ec66666666666, 123.1, v[6:7] +// GFX1250: v_fmamk_f64 v[4:5], lit64(0x405ec66666666666), lit64(0x405ec66666666666), v[6:7] ; encoding: [0xfe,0x0c,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], 123.1, 123.1, v[8:9] +// GFX1250: v_fmamk_f64 v[4:5], lit64(0x405ec66666666666), lit64(0x405ec66666666666), v[8:9] ; encoding: [0xfe,0x10,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], 1.0, 1.0, v[6:7] +// GFX1250: v_fmamk_f64 v[4:5], 1.0, 0x3ff00000, v[6:7] ; encoding: [0xf2,0x0c,0x08,0x46,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], 1e-320, 1e-320, v[6:7] +// GFX1250: v_fmamk_f64 v[4:5], lit64(0x7e8), lit64(0x7e8), v[6:7] ; encoding: [0xfe,0x0c,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX1200-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], lit64(0x7e8), 1e-320, v[8:9] +// GFX1250: v_fmamk_f64 v[4:5], lit64(0x7e8), lit64(0x7e8), v[8:9] ; encoding: [0xfe,0x10,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmamk_f64 v[4:5], lit64(0x7e8), lit64(0x7e8), v[8:9] +// GFX1250: v_fmamk_f64 v[4:5], lit64(0x7e8), lit64(0x7e8), v[8:9] ; encoding: [0xfe,0x10,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], v[4:5], v[8:9], 0x405ec000 +// GFX1250: v_fmaak_f64 v[6:7], v[4:5], v[8:9], 0x405ec000 ; encoding: [0x04,0x11,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], v[254:255], v[8:9], 0x405ec00000000000 +// GFX1250: v_fmaak_f64 v[6:7], v[254:255], v[8:9], 0x405ec000 ; encoding: [0xfe,0x11,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], s[2:3], v[8:9], 0x405ec00012345678 +// GFX1250: v_fmaak_f64 v[6:7], s[2:3], v[8:9], lit64(0x405ec00012345678) ; encoding: [0x02,0x10,0x0c,0x48,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], vcc, v[8:9], 0x405ec000 +// GFX1250: v_fmaak_f64 v[6:7], vcc, v[8:9], 0x405ec000 ; encoding: [0x6a,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], exec, v[8:9], 0x405ec000 +// GFX1250: v_fmaak_f64 v[6:7], exec, v[8:9], 0x405ec000 ; encoding: [0x7e,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], null, v[8:9], 0x405ec000 
+// GFX1250: v_fmaak_f64 v[6:7], null, v[8:9], 0x405ec000 ; encoding: [0x7c,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], -1, v[8:9], 0x405ec000 +// GFX1250: v_fmaak_f64 v[6:7], -1, v[8:9], 0x405ec000 ; encoding: [0xc1,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], 0.5, v[8:9], 0x405ec000 +// GFX1250: v_fmaak_f64 v[6:7], 0.5, v[8:9], 0x405ec000 ; encoding: [0xf0,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[6:7], src_scc, v[8:9], 0x405ec000 +// GFX1250: v_fmaak_f64 v[6:7], src_scc, v[8:9], 0x405ec000 ; encoding: [0xfd,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 +// GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[254:255], 0x405ec00000000000, v[254:255], 0x405ec00000000000 +// GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[254:255], 0x405ec00012345678, v[254:255], 0x405ec00012345678 +// GFX1250: v_fmaak_f64 v[254:255], lit64(0x405ec00012345678), v[254:255], lit64(0x405ec00012345678) ; encoding: [0xfe,0xfc,0xfd,0x49,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[254:255], lit64(0x405ec00012345678), v[254:255], lit(0x405ec00012345678) +// 
GFX1250: v_fmaak_f64 v[254:255], lit64(0x405ec00012345678), v[254:255], lit64(0x405ec00012345678) ; encoding: [0xfe,0xfc,0xfd,0x49,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[254:255], 123.0, v[2:3], 0x405ec000 +// GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[2:3], 0x405ec000 ; encoding: [0xfe,0x04,0xfc,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], v[2:3], v[2:3], 123.1 +// GFX1250: v_fmaak_f64 v[4:5], v[2:3], v[2:3], lit64(0x405ec66666666666) ; encoding: [0x02,0x05,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], 0x405ec66666666666, v[6:7], 123.1 +// GFX1250: v_fmaak_f64 v[4:5], lit64(0x405ec66666666666), v[6:7], lit64(0x405ec66666666666) ; encoding: [0xfe,0x0c,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], 123.1, v[8:9], 123.1 +// GFX1250: v_fmaak_f64 v[4:5], lit64(0x405ec66666666666), v[8:9], lit64(0x405ec66666666666) ; encoding: [0xfe,0x10,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], 1.0, v[8:9], 1.0 +// GFX1250: v_fmaak_f64 v[4:5], 1.0, v[8:9], 0x3ff00000 ; encoding: [0xf2,0x10,0x08,0x48,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], 1e-320, v[6:7], 1e-320 +// GFX1250: v_fmaak_f64 v[4:5], lit64(0x7e8), v[6:7], lit64(0x7e8) ; encoding: [0xfe,0x0c,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], lit64(0x7e8), v[8:9], 1e-320 +// GFX1250: v_fmaak_f64 v[4:5], 
lit64(0x7e8), v[8:9], lit64(0x7e8) ; encoding: [0xfe,0x10,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmaak_f64 v[4:5], lit64(0x7e8), v[8:9], lit64(0x7e8) +// GFX1250: v_fmaak_f64 v[4:5], lit64(0x7e8), v[8:9], lit64(0x7e8) ; encoding: [0xfe,0x10,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s new file mode 100644 index 0000000000000..b68306d60cf8c --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s @@ -0,0 +1,21 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s + +v_fmaak_f32 v4, v2, v6, 3 row_share:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_fmaak_f32 v4, v2, v6, 3 row_share:1 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_fmamk_f32 v4, v2, 3, v6 row_share:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_fmamk_f32 v4, v2, 3, v6 row_share:1 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_fmaak_f16 v4, v2, v6, 3 row_share:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_fmaak_f16 v4, v2, v6, 3 row_share:1 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_fmamk_f16 v4, v2, 3, v6 row_share:1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_fmamk_f16 v4, v2, 3, v6 row_share:1 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vopd.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd.s new file mode 100644 index 0000000000000..9c4c57602ecd3 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd.s @@ -0,0 +1,16276 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x08,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x08,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x08,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: 
[0x03,0x05,0x08,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x08,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x08,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x08,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x08,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x08,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x08,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x08,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_f32 
v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x08,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x08,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x08,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x08,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x08,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x08,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x20,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: 
[0x01,0x05,0x20,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x20,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x20,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x20,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x20,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x20,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x20,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x20,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: 
v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x20,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x20,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x20,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x20,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x20,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x20,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x20,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, 
src_scc, v5 ; encoding: [0xc1,0x08,0x20,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x20,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x12,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x12,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x12,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x12,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x12,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x12,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x12,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x12,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x12,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x12,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x12,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x12,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x12,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: 
v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x12,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x12,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x12,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x12,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x12,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x02,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x02,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// 
GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x02,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x02,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x02,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x02,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x02,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x02,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x02,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x02,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x02,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x02,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x02,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x02,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x02,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 
0.5, v2, 0xaf123456 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x02,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x02,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x02,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x00,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x00,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x00,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x00,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: 
v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x00,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x00,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x00,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x00,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x00,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x00,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x00,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x00,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 
+ +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x00,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x00,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x00,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x00,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x00,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x00,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x05,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v255 :: v_dual_fmamk_f32 
v6, v255, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x05,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x05,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x05,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x05,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x05,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x05,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: 
[0x7b,0xfe,0x05,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x05,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x05,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x05,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x05,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x05,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x05,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x05,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x04,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x04,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x04,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x22,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x22,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: 
[0xff,0x05,0x22,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x22,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x22,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x22,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x22,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x22,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x22,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x22,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x22,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x22,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x22,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x22,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x22,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x22,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x22,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: 
v_dual_add_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x22,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x14,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x14,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x14,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x14,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x14,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x14,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x14,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x14,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x14,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x14,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x14,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x14,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x14,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x14,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x14,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x14,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x14,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x14,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x16,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x16,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x16,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: 
v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x16,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x16,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x16,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x16,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x16,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x16,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x16,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x16,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x16,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x16,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x16,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x16,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x16,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x16,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x16,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_add_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x11,0xc9,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_add_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x11,0xc9,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_add_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x11,0xc9,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_add_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x11,0xc9,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_add_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x11,0xc9,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_add_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x11,0xc9,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_add_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x11,0xc9,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_add_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x11,0xc9,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_add_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x11,0xc9,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_add_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x11,0xc9,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_add_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x11,0xc9,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x11,0xc9,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x11,0xc9,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_add_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x11,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_add_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x11,0xc9,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: 
[0xf0,0x06,0x10,0xc9,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x10,0xc9,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x10,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0e,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0e,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0e,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0e,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0e,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 
+ +v_dual_add_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0e,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0e,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0e,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0e,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0e,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0e,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0e,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: 
v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0e,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0e,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0e,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0e,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x06,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_mul_f32 
v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x06,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x06,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x06,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x06,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x06,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x06,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x06,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x06,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x06,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x06,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x06,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x06,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x06,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x06,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x06,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_f32 
v6, src_scc, v5 ; encoding: [0xc1,0x08,0x06,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x06,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0a,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0a,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0a,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0a,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0a,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0a,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_sub_f32 
v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0a,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0a,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0a,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0a,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0a,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0a,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0a,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: 
[0xff,0x04,0x0a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0a,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0a,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0a,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0c,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0c,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0c,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 
v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0c,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0c,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0c,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0c,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0c,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0c,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0c,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: 
[0x7d,0x04,0x0c,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0c,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0c,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0c,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0c,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0c,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x48,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x48,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x48,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x48,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x48,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x48,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x48,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, 
ttmp15, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x48,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x48,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x48,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x48,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x48,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x48,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x48,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x48,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x48,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x48,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x48,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x60,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x60,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x60,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x60,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x60,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x60,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x60,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x60,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x60,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x60,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x60,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, 
vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x60,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x60,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x60,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x60,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x60,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x60,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x60,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// 
GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x52,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x52,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x52,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x52,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x52,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x52,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x52,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: 
[0x7b,0x04,0x52,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x52,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x52,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x52,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x52,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x52,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x52,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x52,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x52,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x52,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x52,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x42,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x42,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x42,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: 
v_dual_cndmask_b32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x42,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x42,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x42,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, -1, v2, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, -1, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x42,0xca,0xc1,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmaak_f32 v6, 0.5, v5, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmaak_f32 v6, 0.5, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x42,0xca,0xf0,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x42,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x40,0xca,0x01,0x07,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x40,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x40,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x40,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x40,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x40,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x40,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x40,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 
+// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x40,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x40,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x40,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x40,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x40,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x40,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x40,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: 
v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x40,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x40,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x40,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x45,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x45,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x45,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x45,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_cndmask_b32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x45,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x45,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x44,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x44,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x44,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x62,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: 
v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x62,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x62,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x62,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x62,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x62,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x62,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x62,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: 
[0x7f,0x04,0x62,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x62,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x62,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x62,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x62,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x62,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x62,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: 
[0xf0,0x06,0x62,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x62,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x62,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x54,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x54,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x54,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x54,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x54,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x54,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x54,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x54,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x54,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x54,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x54,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x54,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x54,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x54,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x54,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x54,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x54,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x54,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x56,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_cndmask_b32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x56,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x56,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x56,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x56,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x56,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x56,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x56,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: 
v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x56,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x56,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x56,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x56,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x56,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x56,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x56,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: 
v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x56,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x56,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x56,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_cndmask_b32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x51,0xca,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_cndmask_b32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x51,0xca,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_cndmask_b32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x51,0xca,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x51,0xca,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_cndmask_b32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x51,0xca,0x04,0x01,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_cndmask_b32 v255, s105, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x69,0xfe,0x51,0xca,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_cndmask_b32 v255, s1, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x01,0xfe,0x51,0xca,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7b,0xfe,0x51,0xca,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x7f,0xfe,0x51,0xca,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x7e,0xfe,0x51,0xca,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_cndmask_b32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x51,0xca,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x6b,0xfe,0x51,0xca,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, vcc_lo 
+// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x6a,0xfe,0x51,0xca,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x51,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x51,0xca,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x50,0xca,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x50,0xca,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x50,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4e,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, 
v3 ; encoding: [0x01,0x05,0x4e,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4e,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4e,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4e,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4e,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4e,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4e,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; 
encoding: [0x7f,0x04,0x4e,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4e,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4e,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4e,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4e,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4e,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 
0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4e,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4e,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x46,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x46,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x46,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x46,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: 
[0x03,0x05,0x46,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x46,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x46,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x46,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x46,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x46,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x46,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x46,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x46,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x46,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x46,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x46,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x46,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x46,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4a,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_sub_f32 
v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4a,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4a,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4a,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4a,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4a,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4a,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4a,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: 
[0x7f,0x04,0x4a,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4a,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4a,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4a,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4a,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4a,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4a,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4a,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4c,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4c,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4c,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4c,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4c,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_subrev_f32 v6, 
s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4c,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4c,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4c,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4c,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4c,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4c,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4c,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: 
v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4c,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4c,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4c,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4c,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x48,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, 
v2, 0xaf123456 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x48,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x48,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x48,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x48,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x48,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x48,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x48,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x48,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x48,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x48,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x48,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x48,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x48,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_add_f32 v6, -1, v3 +// GFX12: 
v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x48,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x48,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x48,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x48,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x60,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x60,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: 
[0xff,0x05,0x60,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x60,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x60,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x60,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x60,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x60,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x60,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x60,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x60,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x60,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x60,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x60,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x60,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_add_nc_u32 v6, 0.5, 
v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x60,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x60,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x60,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x52,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x52,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x52,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x52,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x52,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x52,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_cndmask_b32 v6, -1, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_cndmask_b32 v6, -1, v2 ; encoding: [0xf0,0x06,0x52,0xc8,0xc1,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_cndmask_b32 v6, 0.5, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_cndmask_b32 v6, 0.5, v5 ; encoding: [0xc1,0x08,0x52,0xc8,0xf0,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x52,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x42,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x42,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x42,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x42,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x42,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x42,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x42,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x42,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x42,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x42,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x42,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x42,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: 
[0x6a,0x04,0x42,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x42,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x42,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x42,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x42,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x42,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: 
[0x04,0x05,0x40,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x40,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x40,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x40,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x40,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x40,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x40,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 
0xaf123456 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x40,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x40,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x40,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x40,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x40,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x40,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 
0xaf123456 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x40,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x40,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x40,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x40,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x40,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, v4, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x45,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, v1, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: 
[0x01,0xff,0x45,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, v255, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x45,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, v2, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x45,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, v3, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x45,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, s105, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x45,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, s1, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x45,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v255, 0xaf123456 :: v_dual_fmamk_f32 
v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x45,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x45,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x45,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, m0, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x45,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x45,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x45,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, 
null, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x45,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x45,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x44,0xc8,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x44,0xc8,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x44,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x62,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: 
v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x62,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x62,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x62,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x62,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x62,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x62,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; 
encoding: [0x7b,0x04,0x62,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x62,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x62,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x62,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x62,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x62,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: 
[0xff,0x04,0x62,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x62,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x62,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x62,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x62,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x54,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x54,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x54,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x54,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x54,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x54,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x54,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x54,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: 
v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x54,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x54,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x54,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x54,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x54,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x54,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_max_num_f32 
v6, -1, v3 ; encoding: [0xfd,0x04,0x54,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x54,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x54,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x54,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x56,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x56,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x56,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x56,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x56,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x56,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x56,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x56,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x56,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_min_num_f32 
v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x56,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x56,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x56,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x56,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x56,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x56,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: 
v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x56,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x56,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x56,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v255, 0xaf123456 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_fmaak_f32 v255, v4, v255, 0xaf123456 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x51,0xc8,0x01,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v255, 0xaf123456 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_fmaak_f32 v255, v1, v255, 0xaf123456 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x51,0xc8,0xff,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v255, 0xaf123456 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_fmaak_f32 v255, v255, v255, 0xaf123456 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x51,0xc8,0x02,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v255, 0xaf123456 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v255, 0xaf123456 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x51,0xc8,0x03,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fmaak_f32 v255, v3, v255, 0xaf123456 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_fmaak_f32 v255, v3, v255, 0xaf123456 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x51,0xc8,0x04,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v255, 0xaf123456 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_fmaak_f32 v255, s105, v255, 0xaf123456 :: v_dual_mov_b32 v6, s105 ; encoding: [0x69,0xfe,0x51,0xc8,0x69,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v255, 0xaf123456 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_fmaak_f32 v255, s1, v255, 0xaf123456 :: v_dual_mov_b32 v6, s1 ; encoding: [0x01,0xfe,0x51,0xc8,0x01,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v255, 0xaf123456 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v255, 0xaf123456 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7b,0xfe,0x51,0xc8,0x7b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v255, 0xaf123456 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v255, 0xaf123456 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x7f,0xfe,0x51,0xc8,0x7f,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v255, 0xaf123456 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v255, 0xaf123456 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x7e,0xfe,0x51,0xc8,0x7e,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v255, 0xaf123456 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_fmaak_f32 v255, m0, v255, 0xaf123456 :: v_dual_mov_b32 v6, m0 ; 
encoding: [0x7d,0xfe,0x51,0xc8,0x7d,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v255, 0xaf123456 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v255, 0xaf123456 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x6b,0xfe,0x51,0xc8,0x6b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v255, 0xaf123456 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v255, 0xaf123456 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x6a,0xfe,0x51,0xc8,0x6a,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x51,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v255, 0xaf123456 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v255, 0xaf123456 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x51,0xc8,0xc1,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x50,0xc8,0xf0,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x50,0xc8,0xfd,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 
v6, null, v5, 0xaf123456 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x50,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4e,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4e,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4e,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4e,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4e,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 
v255, s105, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4e,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4e,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4e,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4e,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4e,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4e,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 
:: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4e,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4e,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4e,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4e,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4e,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: 
v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x46,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x46,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x46,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x46,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x46,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x46,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x46,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x46,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x46,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x46,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x46,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x46,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 
0xaf123456 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x46,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x46,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x46,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x46,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x46,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x46,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4a,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4a,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4a,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4a,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4a,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4a,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4a,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 
0xaf123456 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4a,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4a,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4a,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4a,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4a,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4a,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_sub_f32 v6, null, v3 ; encoding: 
[0xff,0x04,0x4a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4a,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4a,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4a,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4c,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4c,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4c,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4c,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4c,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4c,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4c,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4c,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 
0xaf123456 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4c,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4c,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4c,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4c,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4c,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x4c,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4c,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4c,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x08,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x08,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x08,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x08,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x08,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x08,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x08,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x08,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x08,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x08,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x08,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, 
exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x08,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x08,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x08,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x08,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x08,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x08,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x08,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; 
encoding: [0x04,0x05,0x20,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x20,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x20,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x20,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x20,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x20,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x20,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x20,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, 
v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x20,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x20,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x20,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x20,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x20,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x20,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x20,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 
0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x20,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x20,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x20,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x12,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x12,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x12,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x12,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x12,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x12,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x12,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x12,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x12,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x12,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x12,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x12,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 
:: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x12,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x12,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x12,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x12,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x12,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x12,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x02,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: 
v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x02,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x02,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x02,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x02,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x02,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x02,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: 
[0x7b,0x04,0x02,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x02,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x02,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x02,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x02,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x02,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x02,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x02,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x02,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x02,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x02,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x00,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x00,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x00,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x00,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x00,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x00,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x00,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x00,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x00,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x00,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, 
v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x00,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x00,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x00,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x00,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x00,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x00,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x00,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0x00,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v4 ; encoding: [0x04,0xff,0x05,0xc8,0x01,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v4 ; encoding: [0x01,0xff,0x05,0xc8,0xff,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v4 ; encoding: [0xff,0xff,0x05,0xc8,0x02,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v4 ; encoding: [0x02,0xff,0x05,0xc8,0x03,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v4 ; encoding: [0x03,0xff,0x05,0xc8,0x04,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v4 ; encoding: [0x69,0xfe,0x05,0xc8,0x69,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 
v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v4 ; encoding: [0x01,0xfe,0x05,0xc8,0x01,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v4 ; encoding: [0x7b,0xfe,0x05,0xc8,0x7b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v4 ; encoding: [0x7f,0xfe,0x05,0xc8,0x7f,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v4 ; encoding: [0x7e,0xfe,0x05,0xc8,0x7e,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v4 ; encoding: [0x7d,0xfe,0x05,0xc8,0x7d,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v4 ; encoding: [0x6b,0xfe,0x05,0xc8,0x6b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 
v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v4 ; encoding: [0x6a,0xfe,0x05,0xc8,0x6a,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 ; encoding: [0xff,0xfe,0x05,0xc8,0x7c,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v4 ; encoding: [0xfd,0xfe,0x05,0xc8,0xc1,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v4 ; encoding: [0xf0,0x06,0x04,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v4 ; encoding: [0xc1,0x08,0x04,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0x7c,0x0a,0x04,0xc8,0xff,0xfe,0xff,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x22,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x22,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x22,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x22,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x22,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x22,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x22,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x22,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: 
v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x22,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x22,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x22,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x22,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x22,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x22,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x22,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 
0.5, v2 ; encoding: [0xf0,0x06,0x22,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x22,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x22,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x14,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x14,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x14,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x14,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x14,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x14,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x14,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x14,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x14,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x14,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x14,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x14,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 
+// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x14,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x14,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x14,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x14,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x14,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x14,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x16,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: 
v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x16,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x16,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x16,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x16,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x16,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x16,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x16,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x16,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x16,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x16,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x16,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x16,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x16,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x16,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x16,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 
:: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x16,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x16,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_fmac_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x11,0xc8,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_fmac_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x11,0xc8,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_fmac_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x11,0xc8,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x11,0xc8,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_fmac_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x11,0xc8,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_fmac_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x11,0xc8,0x01,0x00,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_fmac_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x11,0xc8,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x11,0xc8,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x11,0xc8,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x11,0xc8,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_fmac_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x11,0xc8,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x11,0xc8,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x11,0xc8,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_fmac_f32 v255, 
0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x11,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x11,0xc8,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x10,0xc8,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x10,0xc8,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x10,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0e,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0e,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0e,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0e,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0e,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0e,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0e,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0e,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0e,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0e,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0e,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0e,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0e,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0e,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0e,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0e,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x06,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x06,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x06,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x06,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x06,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x06,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; 
encoding: [0x01,0x04,0x06,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x06,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x06,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x06,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x06,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x06,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x06,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x06,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x06,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x06,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x06,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x06,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0a,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0a,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0a,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; 
encoding: [0x02,0x05,0x0a,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0a,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0a,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0a,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0a,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0a,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0a,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0a,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: 
v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0a,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0a,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0a,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0a,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0a,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_subrev_f32 
v6, v1, v3 ; encoding: [0x04,0x05,0x0c,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0c,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0c,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0c,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0c,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0c,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0c,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0c,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 
v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0c,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0c,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0c,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0c,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0c,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0c,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: 
v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0c,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0c,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_add_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_add_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x89,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_add_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_add_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x89,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_add_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_add_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x89,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_add_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_add_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x89,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_add_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_add_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x89,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_add_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_add_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x89,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_add_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_add_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x89,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_add_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_add_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x89,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_add_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_add_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x89,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_add_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_add_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x89,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_add_f32 v6, m0, v255 +// GFX12: 
v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_add_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x89,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_add_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_add_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x89,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_add_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_add_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x89,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_add_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_add_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x89,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_add_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_add_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x89,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_add_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_add_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x89,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_add_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_add_f32 v6, src_scc, v4 ; encoding: 
[0xc1,0xfe,0x89,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_add_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_add_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x88,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v1, v255 ; encoding: [0x04,0xff,0xa1,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v255, v255 ; encoding: [0x01,0xff,0xa1,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v2, v255 ; encoding: [0xff,0xff,0xa1,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v3, v255 ; encoding: [0x02,0xff,0xa1,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v4, v255 ; encoding: [0x03,0xff,0xa1,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, s105, v255 ; encoding: [0x69,0xfe,0xa1,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, s1, v255 ; encoding: [0x01,0xfe,0xa1,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xa1,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xa1,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xa1,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xa1,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 
v255, vcc_hi, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xa1,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xa1,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, null, v255 ; encoding: [0xff,0xfe,0xa1,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xa1,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xa1,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xa1,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_add_nc_u32 v255, 
0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_add_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xa0,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v1, v255 ; encoding: [0x04,0xff,0x93,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v255, v255 ; encoding: [0x01,0xff,0x93,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v2, v255 ; encoding: [0xff,0xff,0x93,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v3, v255 ; encoding: [0x02,0xff,0x93,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v4, v255 ; encoding: [0x03,0xff,0x93,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: 
v_dual_cndmask_b32 v6, null, v255 ; encoding: [0xff,0xfe,0x93,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xf0,0xfe,0x93,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, 0.5, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, 0.5, v4 ; encoding: [0xc1,0xfe,0x93,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_cndmask_b32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_cndmask_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x92,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v1, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v1, v255, 0xaf123456 ; encoding: [0x04,0xff,0x83,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v255, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v255, v255, 0xaf123456 ; encoding: [0x01,0xff,0x83,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v2, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v2, v255, 0xaf123456 
; encoding: [0xff,0xff,0x83,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v3, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v3, v255, 0xaf123456 ; encoding: [0x02,0xff,0x83,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v4, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v4, v255, 0xaf123456 ; encoding: [0x03,0xff,0x83,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, s105, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, s105, v255, 0xaf123456 ; encoding: [0x69,0xfe,0x83,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, s1, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, s1, v255, 0xaf123456 ; encoding: [0x01,0xfe,0x83,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, ttmp15, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, ttmp15, v255, 0xaf123456 ; encoding: [0x7b,0xfe,0x83,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, exec_hi, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, 
v255 :: v_dual_fmaak_f32 v6, exec_hi, v255, 0xaf123456 ; encoding: [0x7f,0xfe,0x83,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, exec_lo, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, exec_lo, v255, 0xaf123456 ; encoding: [0x7e,0xfe,0x83,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, m0, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, m0, v255, 0xaf123456 ; encoding: [0x7d,0xfe,0x83,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, vcc_hi, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, vcc_hi, v255, 0xaf123456 ; encoding: [0x6b,0xfe,0x83,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, vcc_lo, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, vcc_lo, v255, 0xaf123456 ; encoding: [0x6a,0xfe,0x83,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, null, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, null, v255, 0xaf123456 ; encoding: [0xff,0xfe,0x83,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: 
v_dual_fmaak_f32 v6, -1, v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, -1, v255, 0xaf123456 ; encoding: [0xfd,0xfe,0x83,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, 0.5, v3, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, 0.5, v3, 0xaf123456 ; encoding: [0xf0,0xfe,0x83,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, src_scc, v4, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, src_scc, v4, 0xaf123456 ; encoding: [0xc1,0xfe,0x83,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmaak_f32 v255, 0xaf123456, v5, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmaak_f32 v255, 0xaf123456, v5, 0xaf123456 ; encoding: [0x7c,0x08,0x82,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x81,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x81,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: 
v_dual_fmac_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x81,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x81,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x81,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmac_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmac_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x81,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x81,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmac_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmac_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x81,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmac_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmac_f32 
v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x81,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmac_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmac_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x81,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmac_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmac_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x81,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmac_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmac_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x81,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmac_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmac_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x81,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmac_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmac_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x81,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmac_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmac_f32 v6, -1, v255 ; encoding: 
[0xfd,0xfe,0x81,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmac_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmac_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x81,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x81,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmac_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmac_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x80,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v4 ; encoding: [0x04,0xff,0x85,0xc8,0x01,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v4 ; encoding: [0x01,0xff,0x85,0xc8,0xff,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v4 ; encoding: 
[0xff,0xff,0x85,0xc8,0x02,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v4 ; encoding: [0x02,0xff,0x85,0xc8,0x03,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v4 ; encoding: [0x03,0xff,0x85,0xc8,0x04,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v4 ; encoding: [0x69,0xfe,0x85,0xc8,0x69,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v4 ; encoding: [0x01,0xfe,0x85,0xc8,0x01,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v4 ; encoding: [0x7b,0xfe,0x85,0xc8,0x7b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, 
exec_hi, 0xaf123456, v4 ; encoding: [0x7f,0xfe,0x85,0xc8,0x7f,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v4 ; encoding: [0x7e,0xfe,0x85,0xc8,0x7e,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v4 ; encoding: [0x7d,0xfe,0x85,0xc8,0x7d,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v4 ; encoding: [0x6b,0xfe,0x85,0xc8,0x6b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v4 ; encoding: [0x6a,0xfe,0x85,0xc8,0x6a,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 ; encoding: [0xff,0xfe,0x85,0xc8,0x7c,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v4 +// GFX12: 
v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v4 ; encoding: [0xfd,0xfe,0x85,0xc8,0xc1,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v4 ; encoding: [0xf0,0xfe,0x85,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v4 ; encoding: [0xc1,0xfe,0x85,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0x7c,0x08,0x84,0xc8,0xff,0xfe,0xff,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v1, v255 ; encoding: [0x04,0xff,0xa3,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v255, v255 ; encoding: [0x01,0xff,0xa3,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v2, v255 +// GFX12: 
v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v2, v255 ; encoding: [0xff,0xff,0xa3,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v3, v255 ; encoding: [0x02,0xff,0xa3,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v4, v255 ; encoding: [0x03,0xff,0xa3,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, s105, v255 ; encoding: [0x69,0xfe,0xa3,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, s1, v255 ; encoding: [0x01,0xfe,0xa3,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xa3,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_lshlrev_b32 
v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xa3,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xa3,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xa3,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xa3,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xa3,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, null, v255 ; encoding: [0xff,0xfe,0xa3,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, -1, v255 ; 
encoding: [0xfd,0xfe,0xa3,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xa3,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xa3,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_lshlrev_b32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_lshlrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xa2,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x95,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x95,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x95,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x95,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x95,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_max_num_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_max_num_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x95,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x95,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_max_num_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_max_num_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x95,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_max_num_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_max_num_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x95,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 
+ +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_max_num_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_max_num_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x95,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_max_num_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_max_num_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x95,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_max_num_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_max_num_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x95,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_max_num_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_max_num_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x95,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_max_num_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_max_num_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x95,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_max_num_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_max_num_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x95,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 
0.5, 0xaf123456, v255 :: v_dual_max_num_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_max_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x95,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x95,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_max_num_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_max_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x94,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x97,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x97,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x97,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 
v255, v2, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x97,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x97,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_min_num_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_min_num_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x97,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x97,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_min_num_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_min_num_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x97,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_min_num_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_min_num_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x97,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_min_num_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_min_num_f32 
v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x97,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_min_num_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_min_num_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x97,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_min_num_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_min_num_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x97,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_min_num_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_min_num_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x97,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_min_num_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_min_num_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x97,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_min_num_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_min_num_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x97,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_min_num_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_min_num_f32 v6, 0.5, v3 ; encoding: 
[0xf0,0xfe,0x97,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x97,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_min_num_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_min_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x96,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x91,0xc8,0x01,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x91,0xc8,0xff,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x91,0xc8,0x02,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x91,0xc8,0x03,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 
0xaf123456, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x91,0xc8,0x04,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x69,0xfe,0x91,0xc8,0x69,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x01,0xfe,0x91,0xc8,0x01,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7b,0xfe,0x91,0xc8,0x7b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x7f,0xfe,0x91,0xc8,0x7f,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x7e,0xfe,0x91,0xc8,0x7e,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mov_b32 v6, m0 ; encoding: 
[0x7d,0xfe,0x91,0xc8,0x7d,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x6b,0xfe,0x91,0xc8,0x6b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x6a,0xfe,0x91,0xc8,0x6a,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x91,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x91,0xc8,0xc1,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0xfe,0x91,0xc8,0xf0,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0xfe,0x91,0xc8,0xfd,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 
v6, null, 0xaf123456, v4 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x08,0x90,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x8f,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x8f,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x8f,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x8f,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x8f,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 
v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x8f,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x8f,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x8f,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x8f,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x8f,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x8f,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: 
v_dual_mul_dx9_zero_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x8f,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x8f,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x8f,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x8f,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x8f,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x8f,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 
0xaf123456, v4 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x8e,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mul_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mul_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x87,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mul_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mul_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x87,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mul_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mul_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x87,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mul_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mul_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x87,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mul_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mul_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x87,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mul_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: 
v_dual_mul_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x87,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mul_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mul_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x87,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mul_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mul_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x87,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mul_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mul_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x87,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mul_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mul_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x87,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mul_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mul_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x87,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mul_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mul_f32 v6, vcc_hi, v255 ; encoding: 
[0x6b,0xfe,0x87,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mul_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mul_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x87,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mul_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mul_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x87,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mul_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mul_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x87,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mul_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mul_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x87,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mul_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mul_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x87,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mul_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mul_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x86,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_sub_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_sub_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x8b,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_sub_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_sub_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x8b,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_sub_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_sub_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x8b,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_sub_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_sub_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x8b,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_sub_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_sub_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x8b,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_sub_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_sub_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x8b,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_sub_f32 v6, s1, v255 +// GFX12: 
v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_sub_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x8b,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_sub_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_sub_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x8b,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_sub_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_sub_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x8b,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_sub_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_sub_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x8b,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_sub_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_sub_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x8b,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_sub_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_sub_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x8b,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_sub_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_sub_f32 v6, vcc_lo, 
v255 ; encoding: [0x6a,0xfe,0x8b,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_sub_f32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_sub_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x8b,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_sub_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_sub_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x8b,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_sub_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_sub_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x8b,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_sub_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_sub_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x8b,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_sub_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_sub_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x8a,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x8d,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x8d,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x8d,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x8d,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x8d,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_subrev_f32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_subrev_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x8d,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x8d,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, 
v255 :: v_dual_subrev_f32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_subrev_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x8d,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_subrev_f32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_subrev_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x8d,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_subrev_f32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_subrev_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x8d,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_subrev_f32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_subrev_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x8d,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_subrev_f32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_subrev_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x8d,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_subrev_f32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_subrev_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x8d,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_subrev_f32 v6, 
null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_subrev_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x8d,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_subrev_f32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_subrev_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x8d,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_subrev_f32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_subrev_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x8d,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x8d,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_subrev_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_subrev_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x8c,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x88,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x88,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x88,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x88,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x88,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x88,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x88,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x88,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x88,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: 
v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x88,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x88,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x88,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x88,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x88,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x88,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x88,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: 
[0xc1,0x08,0x88,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x88,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa0,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa0,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa0,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa0,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa0,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa0,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa0,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa0,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa0,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa0,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa0,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa0,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa0,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, 
v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa0,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa0,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa0,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x92,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x92,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 
+// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x92,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x92,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x92,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x92,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x92,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x92,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x92,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, 
exec_lo, v3 ; encoding: [0x7e,0x04,0x92,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x92,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x92,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x92,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x92,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x92,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x92,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: 
[0xc1,0x08,0x92,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x92,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x82,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x82,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x82,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x82,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x82,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 
:: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x82,0xca,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x82,0xca,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x82,0xca,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x82,0xca,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x82,0xca,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x82,0xca,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, 
v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x82,0xca,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x82,0xca,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x82,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x82,0xca,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x82,0xca,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x82,0xca,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: 
[0x7c,0x0a,0x82,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x80,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x80,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x80,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x80,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x80,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x80,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x80,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 
v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x80,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x80,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x80,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x80,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x80,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x80,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x80,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, 
v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x80,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x80,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x80,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x80,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x85,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x85,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x85,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x85,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x85,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x85,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x85,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x85,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x85,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, 
exec_lo, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x85,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x85,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x85,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x85,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x85,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: 
v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x84,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x84,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x84,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xa2,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xa2,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xa2,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xa2,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: 
v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xa2,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xa2,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xa2,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa2,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa2,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa2,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa2,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 
v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa2,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa2,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xa2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa2,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa2,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa2,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 
v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x94,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x94,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x94,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x94,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x94,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x94,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x94,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: 
[0x7b,0x04,0x94,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x94,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x94,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x94,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x94,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x94,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x94,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x94,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x94,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x94,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x94,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x96,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x96,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x96,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x96,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x96,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x96,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x96,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x96,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x96,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x96,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x96,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x96,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x96,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x96,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x96,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x96,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x96,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x96,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_max_num_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x91,0xca,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_max_num_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x91,0xca,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_max_num_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x91,0xca,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x91,0xca,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_max_num_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x91,0xca,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_max_num_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x91,0xca,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_max_num_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x91,0xca,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: 
[0x7b,0xfe,0x91,0xca,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x91,0xca,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x91,0xca,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_max_num_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x91,0xca,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x91,0xca,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x91,0xca,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x91,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x91,0xca,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x90,0xca,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x90,0xca,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x90,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8e,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8e,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8e,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8e,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: 
v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8e,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8e,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8e,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8e,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8e,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8e,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8e,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 
:: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8e,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8e,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8e,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8e,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8e,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x86,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x86,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x86,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x86,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x86,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x86,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x86,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: 
v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x86,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x86,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x86,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x86,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x86,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x86,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x86,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x86,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x86,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x86,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x86,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8a,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8a,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8a,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8a,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8a,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8a,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8a,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8a,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8a,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8a,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8a,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: 
v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8a,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8a,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8a,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8a,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8a,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: 
[0x04,0x05,0x8c,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8c,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8c,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8c,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8c,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8c,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8c,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8c,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8c,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8c,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8c,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8c,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8c,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8c,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 
v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8c,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8c,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc8,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc8,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc8,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc8,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_f32 v6, 
v4, v3 ; encoding: [0x03,0x05,0xc8,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc8,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc8,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc8,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc8,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc8,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc8,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc8,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc8,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc8,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc8,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc8,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe0,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: 
v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe0,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe0,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe0,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe0,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe0,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe0,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe0,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, 
vcc_hi, v3 ; encoding: [0x7f,0x04,0xe0,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe0,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe0,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe0,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe0,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe0,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: 
[0xf0,0x06,0xe0,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe0,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xd2,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xd2,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xd2,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xd2,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xd2,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0xd2,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0xd2,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0xd2,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0xd2,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0xd2,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd2,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0xd2,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0xd2,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0xd2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd2,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd2,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd2,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0xc2,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0xc2,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0xc2,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0xc2,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0xc2,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0xc2,0xca,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0xc2,0xca,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: 
v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0xc2,0xca,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0xc2,0xca,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0xc2,0xca,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0xc2,0xca,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0xc2,0xca,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0xc2,0xca,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 
null, v3, 0xaf123456 ; encoding: [0xff,0x04,0xc2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0xc2,0xca,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0xc2,0xca,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0xc2,0xca,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0xc2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc0,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc0,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 
:: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc0,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc0,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc0,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc0,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc0,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc0,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc0,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; 
encoding: [0x7e,0x04,0xc0,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc0,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc0,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc0,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc0,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc0,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc0,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0xc5,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0xc5,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0xc5,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0xc5,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0xc5,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v255 :: v_dual_fmamk_f32 
v6, s105, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0xc5,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0xc5,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0xc5,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0xc5,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0xc5,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0xc5,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: 
v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0xc5,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0xc5,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xc5,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0xc5,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0xc4,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0xc4,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: 
v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc4,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xe2,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xe2,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xe2,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xe2,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xe2,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xe2,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xe2,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe2,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe2,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe2,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe2,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe2,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe2,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: 
[0xff,0x04,0xe2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe2,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe2,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe2,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd4,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd4,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xd4,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd4,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd4,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd4,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd4,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd4,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd4,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd4,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd4,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd4,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd4,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd4,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd4,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd4,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd4,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd4,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd6,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd6,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd6,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd6,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd6,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd6,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, 
v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd6,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd6,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd6,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd6,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd6,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd6,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd6,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: 
v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd6,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd6,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd6,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd6,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd6,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_min_num_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0xd1,0xca,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_min_num_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0xd1,0xca,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_min_num_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 
; encoding: [0xff,0xff,0xd1,0xca,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0xd1,0xca,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_min_num_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0xd1,0xca,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_min_num_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0xd1,0xca,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_min_num_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0xd1,0xca,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0xd1,0xca,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0xd1,0xca,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0xd1,0xca,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, 
v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_min_num_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0xd1,0xca,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0xd1,0xca,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0xd1,0xca,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0xd1,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0xd1,0xca,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0xd0,0xca,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0xd0,0xca,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; 
encoding: [0x7c,0x0a,0xd0,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xce,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xce,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xce,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xce,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xce,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xce,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xce,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xce,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xce,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xce,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xce,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xce,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xce,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: 
v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0xce,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xce,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xce,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xce,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xce,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc6,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc6,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: 
v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc6,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc6,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc6,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc6,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc6,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc6,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc6,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc6,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc6,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc6,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc6,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc6,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc6,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc6,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc6,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_mul_f32 
v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc6,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xca,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xca,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xca,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xca,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xca,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xca,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xca,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xca,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xca,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xca,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xca,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xca,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xca,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0xca,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xca,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xca,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xca,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xca,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xcc,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xcc,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xcc,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: 
v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xcc,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xcc,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xcc,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xcc,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xcc,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xcc,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xcc,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: 
v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xcc,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xcc,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xcc,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0xcc,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xcc,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xcc,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xcc,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 
0xaf123456, v4 ; encoding: [0x7c,0x0a,0xcc,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_add_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_add_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x08,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_add_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_add_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x08,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_add_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_add_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x08,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_add_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_add_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x08,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_add_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_add_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x08,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_add_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_add_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x08,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_add_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_add_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x08,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_add_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 
v255, ttmp15 :: v_dual_add_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x08,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_add_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_add_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x08,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_add_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_add_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x08,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_add_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_add_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x08,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_add_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x08,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_add_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x08,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_add_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_add_f32 v6, null, v255 ; encoding: [0xff,0x00,0x08,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_add_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_add_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x08,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_add_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_add_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x08,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_add_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_add_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x08,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_add_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_add_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x08,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_add_nc_u32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_add_nc_u32 v6, v1, v255 ; encoding: [0x04,0x01,0x20,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_add_nc_u32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_add_nc_u32 v6, v255, v255 ; encoding: [0x01,0x01,0x20,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_add_nc_u32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_add_nc_u32 v6, v2, v255 ; encoding: [0xff,0x01,0x20,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_add_nc_u32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_add_nc_u32 v6, v3, v255 ; encoding: [0x02,0x01,0x20,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_add_nc_u32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_add_nc_u32 v6, v4, v255 ; encoding: 
[0x03,0x01,0x20,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_add_nc_u32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_add_nc_u32 v6, s1, v255 ; encoding: [0x69,0x00,0x20,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_add_nc_u32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_add_nc_u32 v6, s105, v255 ; encoding: [0x01,0x00,0x20,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_add_nc_u32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_nc_u32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x20,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_add_nc_u32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_add_nc_u32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x20,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_add_nc_u32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_add_nc_u32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x20,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_add_nc_u32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_add_nc_u32 v6, m0, v255 ; encoding: [0x7d,0x00,0x20,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_add_nc_u32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_nc_u32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x20,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: 
v_dual_add_nc_u32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_nc_u32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x20,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_add_nc_u32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_add_nc_u32 v6, null, v255 ; encoding: [0xff,0x00,0x20,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_add_nc_u32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_add_nc_u32 v6, -1, v255 ; encoding: [0xfd,0x00,0x20,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_add_nc_u32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_add_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x20,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_add_nc_u32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_add_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x20,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_add_nc_u32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_add_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x20,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v6, v1, v255 ; encoding: [0x04,0x01,0x12,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_cndmask_b32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_cndmask_b32 v6, v255, v255 ; encoding: 
[0x01,0x01,0x12,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_cndmask_b32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_cndmask_b32 v6, v2, v255 ; encoding: [0xff,0x01,0x12,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_cndmask_b32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_cndmask_b32 v6, v3, v255 ; encoding: [0x02,0x01,0x12,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_cndmask_b32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_cndmask_b32 v6, v4, v255 ; encoding: [0x03,0x01,0x12,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_cndmask_b32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_cndmask_b32 v6, s105, v255 ; encoding: [0x69,0x00,0x12,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_cndmask_b32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_cndmask_b32 v6, s1, v255 ; encoding: [0x01,0x00,0x12,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_cndmask_b32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_cndmask_b32 v6, ttmp15, v255 ; encoding: [0x7b,0x00,0x12,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_cndmask_b32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_cndmask_b32 v6, exec_hi, v255 ; encoding: [0x7f,0x00,0x12,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_cndmask_b32 
v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_cndmask_b32 v6, exec_lo, v255 ; encoding: [0x7e,0x00,0x12,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_cndmask_b32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_cndmask_b32 v6, m0, v255 ; encoding: [0x7d,0x00,0x12,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_cndmask_b32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_cndmask_b32 v6, vcc_hi, v255 ; encoding: [0x6b,0x00,0x12,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_cndmask_b32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_cndmask_b32 v6, vcc_lo, v255 ; encoding: [0x6a,0x00,0x12,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_cndmask_b32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_cndmask_b32 v6, null, v255 ; encoding: [0xff,0x00,0x12,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_cndmask_b32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_cndmask_b32 v6, -1, v255 ; encoding: [0xfd,0x00,0x12,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_cndmask_b32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_cndmask_b32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x12,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_cndmask_b32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_cndmask_b32 v6, src_scc, v4 ; encoding: 
[0xc1,0x00,0x12,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_cndmask_b32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_cndmask_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x12,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_fmaak_f32 v6, v1, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_fmaak_f32 v6, v1, v255, 0xaf123456 ; encoding: [0x04,0x01,0x02,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_fmaak_f32 v6, v255, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_fmaak_f32 v6, v255, v255, 0xaf123456 ; encoding: [0x01,0x01,0x02,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_fmaak_f32 v6, v2, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_fmaak_f32 v6, v2, v255, 0xaf123456 ; encoding: [0xff,0x01,0x02,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_fmaak_f32 v6, v3, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_fmaak_f32 v6, v3, v255, 0xaf123456 ; encoding: [0x02,0x01,0x02,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_fmaak_f32 v6, v4, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_fmaak_f32 v6, v4, v255, 0xaf123456 ; encoding: [0x03,0x01,0x02,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_fmaak_f32 v6, s105, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, s105 
:: v_dual_fmaak_f32 v6, s105, v255, 0xaf123456 ; encoding: [0x69,0x00,0x02,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_fmaak_f32 v6, s1, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_fmaak_f32 v6, s1, v255, 0xaf123456 ; encoding: [0x01,0x00,0x02,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_fmaak_f32 v6, ttmp15, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmaak_f32 v6, ttmp15, v255, 0xaf123456 ; encoding: [0x7b,0x00,0x02,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_fmaak_f32 v6, exec_hi, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_fmaak_f32 v6, exec_hi, v255, 0xaf123456 ; encoding: [0x7f,0x00,0x02,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_fmaak_f32 v6, exec_lo, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_fmaak_f32 v6, exec_lo, v255, 0xaf123456 ; encoding: [0x7e,0x00,0x02,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_fmaak_f32 v6, m0, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_fmaak_f32 v6, m0, v255, 0xaf123456 ; encoding: [0x7d,0x00,0x02,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_fmaak_f32 v6, vcc_hi, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmaak_f32 v6, vcc_hi, v255, 0xaf123456 ; encoding: [0x6b,0x00,0x02,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_fmaak_f32 v6, vcc_lo, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmaak_f32 v6, vcc_lo, v255, 0xaf123456 ; encoding: [0x6a,0x00,0x02,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmaak_f32 v6, null, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmaak_f32 v6, null, v255, 0xaf123456 ; encoding: [0xff,0x00,0x02,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_fmaak_f32 v6, -1, v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_fmaak_f32 v6, -1, v255, 0xaf123456 ; encoding: [0xfd,0x00,0x02,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_fmaak_f32 v6, 0.5, v3, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_fmaak_f32 v6, 0.5, v3, 0xaf123456 ; encoding: [0xf0,0x00,0x02,0xca,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_fmaak_f32 v6, src_scc, v4, 0xaf123456 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_fmaak_f32 v6, src_scc, v4, 0xaf123456 ; encoding: [0xc1,0x00,0x02,0xca,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_fmaak_f32 v255, 0xaf123456, v5, 0xaf123456 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_fmaak_f32 v255, 0xaf123456, v5, 0xaf123456 ; encoding: [0x7c,0x00,0x02,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_fmac_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_fmac_f32 v6, v1, v255 ; 
encoding: [0x04,0x01,0x00,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_fmac_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_fmac_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x00,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_fmac_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_fmac_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x00,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_fmac_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_fmac_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x00,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_fmac_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_fmac_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x00,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_fmac_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_fmac_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x00,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_fmac_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_fmac_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x00,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_fmac_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmac_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x00,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_fmac_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, 
exec_hi :: v_dual_fmac_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x00,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_fmac_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_fmac_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x00,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_fmac_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_fmac_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x00,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_fmac_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmac_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x00,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_fmac_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmac_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x00,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmac_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmac_f32 v6, null, v255 ; encoding: [0xff,0x00,0x00,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_fmac_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_fmac_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x00,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_fmac_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_fmac_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x00,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_fmac_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_fmac_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x00,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_fmac_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_fmac_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x00,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0x01,0x04,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0x01,0x04,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0x01,0x04,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0x01,0x04,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0x01,0x04,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0x00,0x04,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0x00,0x04,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0x00,0x04,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0x00,0x04,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0x00,0x04,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0x00,0x04,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 
v255, vcc_hi :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0x00,0x04,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0x00,0x04,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0x00,0x04,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0x00,0x04,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x00,0x04,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x00,0x04,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x00,0x04,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_lshlrev_b32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_lshlrev_b32 v6, v1, v255 ; encoding: [0x04,0x01,0x22,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_lshlrev_b32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_lshlrev_b32 v6, v255, v255 ; encoding: [0x01,0x01,0x22,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_lshlrev_b32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_lshlrev_b32 v6, v2, v255 ; encoding: [0xff,0x01,0x22,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_lshlrev_b32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_lshlrev_b32 v6, v3, v255 ; encoding: [0x02,0x01,0x22,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_lshlrev_b32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_lshlrev_b32 v6, v4, v255 ; encoding: [0x03,0x01,0x22,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_lshlrev_b32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_lshlrev_b32 v6, s1, v255 ; encoding: [0x69,0x00,0x22,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_lshlrev_b32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_lshlrev_b32 v6, s105, v255 ; encoding: [0x01,0x00,0x22,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_lshlrev_b32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshlrev_b32 v6, vcc_lo, 
v255 ; encoding: [0x7b,0x00,0x22,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_lshlrev_b32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_lshlrev_b32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x22,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_lshlrev_b32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_lshlrev_b32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x22,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_lshlrev_b32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_lshlrev_b32 v6, m0, v255 ; encoding: [0x7d,0x00,0x22,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_lshlrev_b32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshlrev_b32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x22,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_lshlrev_b32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshlrev_b32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x22,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_lshlrev_b32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_lshlrev_b32 v6, null, v255 ; encoding: [0xff,0x00,0x22,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_lshlrev_b32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_lshlrev_b32 v6, -1, v255 ; encoding: [0xfd,0x00,0x22,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_lshlrev_b32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_lshlrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x22,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_lshlrev_b32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_lshlrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x22,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_lshlrev_b32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_lshlrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x22,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_max_num_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_max_num_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x14,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_max_num_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_max_num_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x14,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_max_num_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_max_num_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x14,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_max_num_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_max_num_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x14,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_max_num_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_max_num_f32 v6, 
v4, v255 ; encoding: [0x03,0x01,0x14,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_max_num_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_max_num_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x14,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_max_num_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_max_num_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x14,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_max_num_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_num_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x14,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_max_num_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_max_num_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x14,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_max_num_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_max_num_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x14,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_max_num_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_max_num_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x14,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_max_num_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_num_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x14,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, vcc_lo :: v_dual_max_num_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_num_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x14,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_max_num_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_max_num_f32 v6, null, v255 ; encoding: [0xff,0x00,0x14,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_max_num_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_max_num_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x14,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_max_num_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_max_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x14,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_max_num_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_max_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x14,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_max_num_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_max_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x14,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_min_num_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_min_num_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x16,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_min_num_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: 
v_dual_min_num_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x16,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_min_num_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_min_num_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x16,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_min_num_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_min_num_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x16,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_min_num_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_min_num_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x16,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_min_num_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_min_num_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x16,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_min_num_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_min_num_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x16,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_min_num_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_num_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x16,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_min_num_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_min_num_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x16,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, exec_lo :: v_dual_min_num_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_min_num_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x16,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_min_num_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_min_num_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x16,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_min_num_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_num_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x16,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_min_num_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_num_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x16,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_min_num_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_min_num_f32 v6, null, v255 ; encoding: [0xff,0x00,0x16,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_min_num_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_min_num_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x16,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_min_num_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_min_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x16,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_min_num_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: 
v_dual_min_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x16,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_min_num_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_min_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x16,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0x01,0x10,0xca,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0x01,0x10,0xca,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0x01,0x10,0xca,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0x01,0x10,0xca,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0x01,0x10,0xca,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0x00,0x10,0xca,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_mov_b32 v6, s105 ; encoding: 
[0x01,0x00,0x10,0xca,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0x00,0x10,0xca,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0x00,0x10,0xca,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0x00,0x10,0xca,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0x00,0x10,0xca,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0x00,0x10,0xca,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0x00,0x10,0xca,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0x00,0x10,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_mov_b32 
v6, -1 ; encoding: [0xfd,0x00,0x10,0xca,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x00,0x10,0xca,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x00,0x10,0xca,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x00,0x10,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_mul_dx9_zero_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_mul_dx9_zero_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x0e,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_mul_dx9_zero_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_mul_dx9_zero_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x0e,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_mul_dx9_zero_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_mul_dx9_zero_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x0e,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x0e,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: 
v_dual_mul_dx9_zero_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_mul_dx9_zero_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x0e,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_mul_dx9_zero_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_mul_dx9_zero_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x0e,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_mul_dx9_zero_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_mul_dx9_zero_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x0e,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x0e,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x0e,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x0e,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_mul_dx9_zero_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_mul_dx9_zero_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x0e,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi 
:: v_dual_mul_dx9_zero_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x0e,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x0e,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, null, v255 ; encoding: [0xff,0x00,0x0e,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_mul_dx9_zero_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_mul_dx9_zero_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x0e,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x0e,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x0e,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x0e,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_mul_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_mul_f32 v6, v1, v255 ; 
encoding: [0x04,0x01,0x06,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_mul_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_mul_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x06,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_mul_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_mul_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x06,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_mul_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_mul_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x06,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_mul_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_mul_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x06,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_mul_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_mul_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x06,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_mul_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_mul_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x06,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x06,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_mul_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: 
v_dual_mul_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x06,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_mul_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x06,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_mul_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_mul_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x06,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x06,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x06,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mul_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mul_f32 v6, null, v255 ; encoding: [0xff,0x00,0x06,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_mul_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_mul_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x06,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_mul_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x06,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, -1 :: v_dual_mul_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_mul_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x06,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_mul_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_mul_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x06,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_sub_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_sub_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x0a,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_sub_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_sub_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x0a,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_sub_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_sub_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x0a,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_sub_f32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_sub_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x0a,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_sub_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_sub_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x0a,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_sub_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_sub_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x0a,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_sub_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_sub_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x0a,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x0a,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_sub_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x0a,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_sub_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x0a,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_sub_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_sub_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x0a,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x0a,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x0a,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_sub_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_sub_f32 v6, null, v255 ; 
encoding: [0xff,0x00,0x0a,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_sub_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_sub_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x0a,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_sub_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x0a,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_sub_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_sub_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x0a,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_sub_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_sub_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x0a,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_subrev_f32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_subrev_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x0c,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_subrev_f32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_subrev_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x0c,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_subrev_f32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_subrev_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x0c,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_subrev_f32 v6, 
v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_subrev_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x0c,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_subrev_f32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_subrev_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x0c,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_subrev_f32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_subrev_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x0c,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_subrev_f32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_subrev_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x0c,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_subrev_f32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_subrev_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x0c,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_subrev_f32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_subrev_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x0c,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_subrev_f32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_subrev_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x0c,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_subrev_f32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_subrev_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x0c,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_subrev_f32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_subrev_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x0c,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_subrev_f32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_subrev_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x0c,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_subrev_f32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_subrev_f32 v6, null, v255 ; encoding: [0xff,0x00,0x0c,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_subrev_f32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_subrev_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x0c,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_subrev_f32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_subrev_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x0c,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_subrev_f32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_subrev_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x0c,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_subrev_f32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_subrev_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x0c,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, 
v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc8,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc8,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc8,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc8,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc8,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc8,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc8,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; 
encoding: [0x7b,0x04,0xc8,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc8,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc8,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc8,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc8,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc8,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, 
v3 ; encoding: [0xfd,0x04,0xc8,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc8,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc8,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe0,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe0,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe0,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: 
[0x02,0x05,0xe0,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe0,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe0,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe0,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe0,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe0,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe0,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: 
[0x7d,0x04,0xe0,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe0,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe0,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe0,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe0,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe0,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: 
v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xd2,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xd2,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xd2,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xd2,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xd2,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0xd2,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: 
v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0xd2,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0xd2,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0xd2,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0xd2,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd2,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0xd2,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0xd2,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: 
v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0xd2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd2,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd2,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd2,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0xc2,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0xc2,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0xc2,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0xc2,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0xc2,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0xc2,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0xc2,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0xc2,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 
exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0xc2,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0xc2,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0xc2,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0xc2,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0xc2,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0xc2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 
src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0xc2,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0xc2,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0xc2,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0xc2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc0,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc0,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xc0,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc0,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc0,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc0,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc0,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc0,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc0,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc0,0xc9,0x7b,0x06,0x06,0xff] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc0,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc0,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc0,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc0,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc0,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc0,0xc9,0xfd,0x0a,0x06,0xff] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0xc5,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0xc5,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0xc5,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0xc5,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0xc5,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0xc5,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0xc5,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0xc5,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0xc5,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0xc5,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0xc5,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0xc5,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0xc5,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xc5,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0xc5,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0xc4,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0xc4,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc4,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xe2,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xe2,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xe2,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xe2,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xe2,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: 
[0x69,0x04,0xe2,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xe2,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe2,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe2,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe2,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe2,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe2,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: 
v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe2,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xe2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe2,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe2,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe2,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd4,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 
+// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd4,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd4,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd4,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd4,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd4,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd4,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd4,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: 
v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd4,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd4,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd4,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd4,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd4,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd4,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd4,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 
v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd4,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd4,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd4,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd6,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd6,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd6,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd6,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd6,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd6,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd6,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd6,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd6,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd6,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd6,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd6,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd6,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd6,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd6,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd6,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd6,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0xd6,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0xd1,0xc9,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0xd1,0xc9,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0xd1,0xc9,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0xd1,0xc9,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0xd1,0xc9,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0xd1,0xc9,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0xd1,0xc9,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0xd1,0xc9,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0xd1,0xc9,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0xd1,0xc9,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0xd1,0xc9,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0xd1,0xc9,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0xd1,0xc9,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0xd1,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0xd1,0xc9,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0xd0,0xc9,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0xd0,0xc9,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0xd0,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xce,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xce,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xce,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xce,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xce,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xce,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xce,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xce,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xce,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xce,0xc9,0x7b,0x06,0x06,0xff] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xce,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xce,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xce,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0xce,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xce,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xce,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 
v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xce,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xce,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc6,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc6,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc6,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc6,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc6,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_f32 
v6, s1, v3 ; encoding: [0x69,0x04,0xc6,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc6,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc6,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc6,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc6,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc6,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc6,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: 
[0x6a,0x04,0xc6,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc6,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc6,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc6,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc6,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc6,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xca,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: 
[0x01,0x05,0xca,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xca,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xca,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xca,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xca,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xca,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xca,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xca,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xca,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xca,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xca,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xca,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0xca,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xca,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xca,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xca,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xca,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xcc,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xcc,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xcc,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xcc,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xcc,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xcc,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xcc,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xcc,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xcc,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xcc,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xcc,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xcc,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xcc,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0xcc,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xcc,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xcc,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xcc,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xcc,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc8,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc8,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc8,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc8,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc8,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc8,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc8,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc8,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; 
encoding: [0x7f,0x04,0xc8,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc8,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc8,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc8,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc8,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc8,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc8,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc8,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc8,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc8,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe0,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe0,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe0,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe0,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe0,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: 
[0x69,0x04,0xe0,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe0,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe0,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe0,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe0,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe0,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe0,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe0,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe0,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe0,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe0,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xd2,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xd2,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: 
v_dual_mul_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xd2,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xd2,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xd2,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0xd2,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0xd2,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0xd2,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0xd2,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0xd2,0xc8,0x7e,0x06,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd2,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0xd2,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0xd2,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0xd2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd2,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd2,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd2,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, 
null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0xc2,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0xc2,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0xc2,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0xc2,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0xc2,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: 
[0x69,0x04,0xc2,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0xc2,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0xc2,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0xc2,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0xc2,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0xc2,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0xc2,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0xc2,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0xc2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0xc2,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0xc2,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0xc2,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0xc2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, 
v3 ; encoding: [0x04,0x05,0xc0,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc0,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc0,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc0,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc0,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc0,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc0,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc0,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 
+// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc0,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc0,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc0,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc0,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc0,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc0,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: 
[0xf0,0x06,0xc0,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc0,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0xc5,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0xc5,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0xc5,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0xc5,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: 
v_dual_mul_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0xc5,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0xc5,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0xc5,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0xc5,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0xc5,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0xc5,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: 
[0x7d,0xfe,0xc5,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0xc5,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0xc5,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xc5,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0xc5,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0xc4,0xc8,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0xc4,0xc8,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc4,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xe2,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xe2,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xe2,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xe2,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xe2,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xe2,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, 
s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xe2,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe2,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe2,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe2,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe2,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe2,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe2,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, 
null, v3 ; encoding: [0xff,0x04,0xe2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe2,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe2,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe2,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd4,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd4,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd4,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd4,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd4,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd4,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd4,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd4,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd4,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd4,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: 
v_dual_mul_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd4,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd4,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd4,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd4,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd4,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd4,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd4,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; 
encoding: [0x7c,0x0a,0xd4,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd6,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd6,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd6,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd6,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd6,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd6,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd6,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 
ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd6,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd6,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd6,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd6,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd6,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd6,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd6,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: 
v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd6,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd6,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd6,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd6,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_mul_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0xd1,0xc8,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_mul_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0xd1,0xc8,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_mul_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0xd1,0xc8,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_mul_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0xd1,0xc8,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_mul_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0xd1,0xc8,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_mul_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0xd1,0xc8,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_mul_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0xd1,0xc8,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0xd1,0xc8,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0xd1,0xc8,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0xd1,0xc8,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_mul_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0xd1,0xc8,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: 
[0x6b,0xfe,0xd1,0xc8,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0xd1,0xc8,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0xd1,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_mul_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0xd1,0xc8,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0xd0,0xc8,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0xd0,0xc8,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0xd0,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xce,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, 
v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xce,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xce,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xce,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xce,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xce,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xce,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xce,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, 
exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xce,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xce,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xce,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xce,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xce,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0xce,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xce,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 
0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xce,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xce,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xce,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc6,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc6,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc6,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc6,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc6,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc6,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc6,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc6,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc6,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc6,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc6,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc6,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, 
v3 ; encoding: [0x6a,0x04,0xc6,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc6,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc6,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc6,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc6,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc6,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xca,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xca,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xca,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xca,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xca,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xca,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xca,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xca,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xca,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: 
[0x7e,0x04,0xca,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xca,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xca,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xca,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0xca,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xca,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xca,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xca,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 
:: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xca,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xcc,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xcc,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xcc,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xcc,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xcc,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xcc,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xcc,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xcc,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xcc,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xcc,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xcc,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xcc,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xcc,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0xcc,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xcc,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xcc,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xcc,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xcc,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x48,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x48,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x48,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: 
v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x48,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x48,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x48,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x48,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x48,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x48,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x48,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x48,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, 
v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x48,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x48,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x48,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x48,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x48,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x48,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x48,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 
; encoding: [0x04,0x05,0x60,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x60,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x60,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x60,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x60,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x60,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x60,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x60,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: 
v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x60,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x60,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x60,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x60,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x60,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x60,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x60,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: 
v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x60,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x60,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x60,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x52,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x52,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x52,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x52,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x52,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x52,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x52,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x52,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x52,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x52,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x52,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x52,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: 
v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x52,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x52,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x52,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x52,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x52,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x52,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x42,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, 
v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x42,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x42,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x42,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x42,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x42,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x42,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x42,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x42,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x42,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x42,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x42,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x42,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x42,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: 
v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x42,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x42,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x42,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x42,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x40,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x40,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x40,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: 
v_dual_sub_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x40,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x40,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x40,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x40,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x40,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x40,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x40,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x40,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x40,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x40,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x40,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x40,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x40,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x40,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x40,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, 
v255 +// GFX12: v_dual_sub_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x45,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x45,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x45,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x45,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x45,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x45,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: 
[0x01,0xfe,0x45,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x45,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x45,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x45,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x45,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x45,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x45,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x45,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x45,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x44,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x44,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x44,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x62,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_sub_f32 
v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x62,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x62,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x62,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x62,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x62,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x62,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x62,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x62,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x62,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x62,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x62,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x62,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x62,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x62,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x62,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: 
v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x62,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x62,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x54,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x54,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x54,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x54,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x54,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: 
[0x69,0x04,0x54,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x54,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x54,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x54,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x54,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x54,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x54,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x54,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x54,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x54,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x54,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x54,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x54,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x56,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x56,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_min_num_f32 
v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x56,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x56,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x56,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x56,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x56,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x56,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x56,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: 
[0x7e,0x04,0x56,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x56,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x56,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x56,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x56,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x56,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x56,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x56,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x56,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_sub_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x51,0xc9,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_sub_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x51,0xc9,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_sub_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x51,0xc9,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_sub_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x51,0xc9,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_sub_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x51,0xc9,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_sub_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x51,0xc9,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_sub_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: 
[0x01,0xfe,0x51,0xc9,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x51,0xc9,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x51,0xc9,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x51,0xc9,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_sub_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x51,0xc9,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x51,0xc9,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x51,0xc9,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x51,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 
src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_sub_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x51,0xc9,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x50,0xc9,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x50,0xc9,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x50,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4e,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4e,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4e,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x4e,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4e,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x4e,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x4e,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x4e,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x4e,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x4e,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4e,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x4e,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x4e,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4e,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4e,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4e,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0x4e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x46,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x46,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x46,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x46,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x46,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x46,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x46,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 
v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x46,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x46,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x46,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x46,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x46,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x46,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x46,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x46,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x46,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x46,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x46,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4a,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4a,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4a,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4a,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, 
v3 ; encoding: [0x03,0x05,0x4a,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x4a,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x4a,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x4a,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x4a,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x4a,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4a,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x4a,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: 
v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x4a,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4a,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4a,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4a,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4c,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: 
[0x01,0x05,0x4c,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4c,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4c,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4c,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x4c,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x4c,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x4c,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x4c,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: 
v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x4c,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4c,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x4c,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x4c,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4c,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4c,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, 
src_scc, v5 ; encoding: [0xc1,0x08,0x4c,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x88,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x88,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x88,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x88,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x88,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x88,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x88,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x88,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x88,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x88,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x88,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x88,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x88,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 
0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x88,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x88,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x88,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x88,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x88,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa0,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa0,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xa0,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa0,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa0,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa0,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa0,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa0,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa0,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa0,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa0,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa0,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa0,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa0,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa0,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa0,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: 
v_dual_add_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x92,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x92,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x92,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x92,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x92,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x92,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: 
v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x92,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x92,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x92,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x92,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x92,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x92,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x92,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; 
encoding: [0xff,0x04,0x92,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x92,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x92,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x92,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x92,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x82,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x82,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, v255, 
v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x82,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x82,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x82,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x82,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x82,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x82,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x82,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x82,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x82,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x82,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x82,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x82,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x82,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 
v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x82,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x82,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x82,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x80,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x80,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x80,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x80,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 
v3, v2 :: v_dual_fmac_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x80,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x80,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x80,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x80,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x80,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x80,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x80,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 
; encoding: [0x6b,0x04,0x80,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x80,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x80,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x80,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x80,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x80,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x80,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: 
[0x04,0xff,0x85,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x85,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x85,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x85,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x85,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x85,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x85,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x85,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x85,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x85,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x85,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x85,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x85,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 
+ +v_dual_subrev_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x85,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x84,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x84,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x84,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xa2,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: 
v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xa2,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xa2,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xa2,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xa2,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xa2,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xa2,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa2,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa2,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa2,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa2,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa2,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa2,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xa2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa2,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa2,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa2,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x94,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x94,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x94,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x94,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x94,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, 
v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x94,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x94,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x94,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x94,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x94,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x94,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x94,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: 
v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x94,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x94,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x94,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x94,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x94,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x94,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x96,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; 
encoding: [0x01,0x05,0x96,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x96,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x96,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x96,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x96,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x96,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x96,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x96,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x96,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x96,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x96,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x96,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x96,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x96,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x96,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x96,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x96,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 +// GFX12: v_dual_subrev_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x91,0xc9,0x01,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 +// GFX12: v_dual_subrev_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x91,0xc9,0xff,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 +// GFX12: v_dual_subrev_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x91,0xc9,0x02,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x91,0xc9,0x03,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 +// GFX12: v_dual_subrev_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x91,0xc9,0x04,0x01,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 +// GFX12: v_dual_subrev_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: 
[0x69,0xfe,0x91,0xc9,0x01,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 +// GFX12: v_dual_subrev_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x91,0xc9,0x69,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo +// GFX12: v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x91,0xc9,0x6a,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi +// GFX12: v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x91,0xc9,0x6b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x91,0xc9,0x7b,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 +// GFX12: v_dual_subrev_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x91,0xc9,0x7d,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x91,0xc9,0x7e,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x91,0xc9,0x7f,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 
v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x91,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 +// GFX12: v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x91,0xc9,0xc1,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x90,0xc9,0xf0,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x90,0xc9,0xfd,0x00,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x90,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8e,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8e,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, 
v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8e,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8e,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8e,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8e,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8e,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8e,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8e,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: 
v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8e,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8e,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8e,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8e,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8e,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8e,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 +// GFX12: 
v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8e,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x86,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x86,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x86,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x86,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x86,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x86,0xc9,0x01,0x06,0x06,0xff] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x86,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x86,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x86,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x86,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x86,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x86,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x86,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, 
v2 :: v_dual_mul_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x86,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x86,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x86,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x86,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x86,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8a,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8a,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_f32 v6, 
v2, v3 ; encoding: [0xff,0x05,0x8a,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8a,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8a,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8a,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8a,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8a,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8a,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8a,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8a,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8a,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8a,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8a,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8a,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8a,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 
v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8c,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8c,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8c,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8c,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8c,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8c,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8c,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8c,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8c,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8c,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8c,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8c,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8c,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8c,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8c,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8c,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2e,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2e,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2e,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_add_f32 
v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2e,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2e,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2e,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x2e,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2e,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2e,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2e,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2e,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 
v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2e,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2e,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2e,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2e,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2e,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: 
v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6e,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6e,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6e,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6e,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6e,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6e,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6e,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6e,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6e,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6e,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6e,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6e,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6e,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6e,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 
+// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6e,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6e,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6e,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6e,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6e,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6e,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, 
v2, 0xaf123456 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6e,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6e,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6e,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6e,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6e,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6e,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_max_i32 v6, m0, v3 ; encoding: 
[0x7d,0x04,0x6e,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6e,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6e,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6e,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6e,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6e,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2e,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2e,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2e,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2e,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2e,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2e,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; 
encoding: [0x01,0x04,0x2e,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2e,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2e,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2e,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2e,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2e,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2e,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2e,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2e,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2e,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_max_i32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_max_i32 v6, v1, v255 ; encoding: [0x04,0xff,0xaf,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_max_i32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_max_i32 v6, v255, v255 ; encoding: [0x01,0xff,0xaf,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_max_i32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_max_i32 v6, v2, v255 ; encoding: [0xff,0xff,0xaf,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_max_i32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_max_i32 v6, v3, v255 ; encoding: [0x02,0xff,0xaf,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_max_i32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_max_i32 v6, v4, v255 ; encoding: [0x03,0xff,0xaf,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_max_i32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_max_i32 v6, s105, v255 ; encoding: [0x69,0xfe,0xaf,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_max_i32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_max_i32 v6, s1, v255 ; encoding: [0x01,0xfe,0xaf,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_max_i32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_max_i32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xaf,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_max_i32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_max_i32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xaf,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_max_i32 v6, 
exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_max_i32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xaf,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_max_i32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_max_i32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xaf,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_max_i32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_max_i32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xaf,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_max_i32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_max_i32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xaf,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_max_i32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_max_i32 v6, null, v255 ; encoding: [0xff,0xfe,0xaf,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_max_i32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_max_i32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xaf,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_max_i32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: 
v_dual_max_i32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xaf,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_max_i32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_max_i32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xaf,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_max_i32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_max_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xae,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xae,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xae,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xae,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xae,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: 
[0x03,0x05,0xae,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xae,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xae,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xae,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xae,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xae,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xae,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xae,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xae,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xae,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xae,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xae,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xae,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xae,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xee,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_i32 
v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xee,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xee,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xee,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xee,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xee,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xee,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xee,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xee,0xca,0x6b,0x06,0x06,0xff] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xee,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xee,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xee,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xee,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xee,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xee,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xee,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xee,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xee,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_max_i32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_max_i32 v6, v1, v255 ; encoding: [0x04,0x01,0x2e,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_max_i32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_max_i32 v6, v255, v255 ; encoding: [0x01,0x01,0x2e,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_max_i32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_max_i32 v6, v2, v255 ; encoding: [0xff,0x01,0x2e,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_max_i32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_max_i32 v6, v3, v255 ; encoding: [0x02,0x01,0x2e,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_max_i32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_max_i32 v6, v4, v255 ; encoding: [0x03,0x01,0x2e,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_max_i32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_max_i32 v6, s1, v255 ; encoding: [0x69,0x00,0x2e,0xca,0x01,0xfe,0x07,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_max_i32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_max_i32 v6, s105, v255 ; encoding: [0x01,0x00,0x2e,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_max_i32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_i32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x2e,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_max_i32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_max_i32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x2e,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_max_i32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_max_i32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x2e,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_max_i32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_max_i32 v6, m0, v255 ; encoding: [0x7d,0x00,0x2e,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_max_i32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_i32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x2e,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_max_i32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_i32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x2e,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_max_i32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: 
v_dual_max_i32 v6, null, v255 ; encoding: [0xff,0x00,0x2e,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_max_i32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_max_i32 v6, -1, v255 ; encoding: [0xfd,0x00,0x2e,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_max_i32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_max_i32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x2e,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_max_i32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_max_i32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x2e,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_max_i32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_max_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x2e,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xee,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xee,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xee,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xee,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xee,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xee,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xee,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xee,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xee,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xee,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xee,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xee,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xee,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xee,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xee,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xee,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xee,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xee,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xee,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xee,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xee,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xee,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xee,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xee,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xee,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xee,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xee,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xee,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xee,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xee,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xee,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xee,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xee,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xee,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xee,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xee,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6e,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6e,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6e,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x6e,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6e,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x6e,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x6e,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x6e,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x6e,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x6e,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6e,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 
+// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x6e,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x6e,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6e,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6e,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6e,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: 
[0x04,0x05,0xae,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xae,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xae,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xae,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xae,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xae,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xae,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xae,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_i32 
v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xae,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xae,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xae,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xae,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xae,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xae,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xae,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, 
v2 ; encoding: [0xf0,0x06,0xae,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xae,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xae,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x30,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x30,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x30,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x30,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x30,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: 
v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x30,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x30,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x30,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x30,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x30,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x30,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x30,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x30,0xc9,0x7f,0x06,0x06,0xff] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x30,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x30,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x30,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x30,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x30,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x70,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x70,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: 
v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x70,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x70,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x70,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x70,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x70,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x70,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x70,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: 
[0x7e,0x04,0x70,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x70,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x70,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x70,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x70,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x70,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x70,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x70,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x70,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x70,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x70,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x70,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x70,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x70,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_min_i32 v6, s105, v3 ; encoding: 
[0x69,0x04,0x70,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x70,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x70,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x70,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x70,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x70,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x70,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x70,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x70,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x70,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x70,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x70,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x70,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, 
v3 ; encoding: [0x04,0x05,0x30,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x30,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x30,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x30,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x30,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x30,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x30,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x30,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 
+// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x30,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x30,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x30,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x30,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x30,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x30,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x30,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: 
[0xf0,0x06,0x30,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x30,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x30,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_min_i32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_min_i32 v6, v1, v255 ; encoding: [0x04,0xff,0xb1,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_min_i32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_min_i32 v6, v255, v255 ; encoding: [0x01,0xff,0xb1,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_min_i32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_min_i32 v6, v2, v255 ; encoding: [0xff,0xff,0xb1,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_min_i32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_min_i32 v6, v3, v255 ; encoding: [0x02,0xff,0xb1,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_min_i32 v6, v4, v255 +// GFX12: 
v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_min_i32 v6, v4, v255 ; encoding: [0x03,0xff,0xb1,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_min_i32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_min_i32 v6, s105, v255 ; encoding: [0x69,0xfe,0xb1,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_min_i32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_min_i32 v6, s1, v255 ; encoding: [0x01,0xfe,0xb1,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_min_i32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_min_i32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xb1,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_min_i32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_min_i32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xb1,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_min_i32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_min_i32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xb1,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_min_i32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_min_i32 v6, m0, v255 ; encoding: 
[0x7d,0xfe,0xb1,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_min_i32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_min_i32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xb1,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_min_i32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_min_i32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xb1,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_min_i32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_min_i32 v6, null, v255 ; encoding: [0xff,0xfe,0xb1,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_min_i32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_min_i32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xb1,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_min_i32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_min_i32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xb1,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_min_i32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_min_i32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xb1,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_min_i32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_min_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xb0,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xb0,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xb0,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xb0,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xb0,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xb0,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xb0,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_i32 v6, 
s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xb0,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xb0,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xb0,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xb0,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xb0,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xb0,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xb0,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, 
null, v3 ; encoding: [0xff,0x04,0xb0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xb0,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xb0,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xb0,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xb0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xf0,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xf0,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xf0,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xf0,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xf0,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xf0,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xf0,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xf0,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xf0,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xf0,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: 
v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xf0,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xf0,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xf0,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xf0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xf0,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xf0,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xf0,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; 
encoding: [0x7c,0x0a,0xf0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_min_i32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_min_i32 v6, v1, v255 ; encoding: [0x04,0x01,0x30,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_min_i32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_min_i32 v6, v255, v255 ; encoding: [0x01,0x01,0x30,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_min_i32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_min_i32 v6, v2, v255 ; encoding: [0xff,0x01,0x30,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_min_i32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_min_i32 v6, v3, v255 ; encoding: [0x02,0x01,0x30,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_min_i32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_min_i32 v6, v4, v255 ; encoding: [0x03,0x01,0x30,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_min_i32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_min_i32 v6, s1, v255 ; encoding: [0x69,0x00,0x30,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_min_i32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_min_i32 v6, s105, v255 ; encoding: [0x01,0x00,0x30,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_min_i32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: 
v_dual_min_i32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x30,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_min_i32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_min_i32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x30,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_min_i32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_min_i32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x30,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_min_i32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_min_i32 v6, m0, v255 ; encoding: [0x7d,0x00,0x30,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_min_i32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_i32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x30,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_min_i32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_i32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x30,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_min_i32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_min_i32 v6, null, v255 ; encoding: [0xff,0x00,0x30,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_min_i32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_min_i32 v6, -1, v255 ; encoding: [0xfd,0x00,0x30,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_min_i32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_min_i32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x30,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_min_i32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_min_i32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x30,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_min_i32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_min_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x30,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xf0,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xf0,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xf0,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xf0,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: 
v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xf0,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xf0,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xf0,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xf0,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xf0,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xf0,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xf0,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: 
[0x6b,0x04,0xf0,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xf0,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xf0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xf0,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xf0,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xf0,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xf0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: 
[0x04,0x05,0xf0,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xf0,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xf0,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xf0,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xf0,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xf0,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xf0,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xf0,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 
v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xf0,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xf0,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xf0,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xf0,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xf0,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xf0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xf0,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xf0,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xf0,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xf0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x70,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x70,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x70,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x70,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x70,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: 
[0x69,0x04,0x70,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x70,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x70,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x70,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x70,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x70,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x70,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x70,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 
:: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x70,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x70,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x70,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x70,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x70,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xb0,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xb0,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xb0,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xb0,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xb0,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xb0,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xb0,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xb0,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xb0,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xb0,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 
m0, v2 :: v_dual_min_i32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xb0,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xb0,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xb0,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xb0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xb0,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xb0,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xb0,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: 
v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xb0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x28,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x28,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x28,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x28,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x28,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x28,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x28,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x28,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x28,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x28,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x28,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x28,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x28,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x28,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// 
GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x28,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x28,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x28,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x28,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x68,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x68,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x68,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x68,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x68,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x68,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x68,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x68,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x68,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x68,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x68,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x68,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x68,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x68,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x68,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x68,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x68,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x68,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x68,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x68,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x68,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x68,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x68,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x68,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 
0xaf123456 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x68,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x68,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x68,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x68,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x68,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x68,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: 
[0x6a,0x04,0x68,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x68,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x68,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x68,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x68,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x68,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x28,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 
v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x28,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x28,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x28,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x28,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x28,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x28,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x28,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: 
[0x7f,0x04,0x28,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x28,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x28,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x28,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x28,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x28,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x28,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x28,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x28,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x28,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v1, v255 ; encoding: [0x04,0xff,0xa9,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v255, v255 ; encoding: [0x01,0xff,0xa9,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v2, v255 ; encoding: [0xff,0xff,0xa9,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v3, v255 ; encoding: [0x02,0xff,0xa9,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: 
v_dual_sub_nc_u32 v6, v4, v255 ; encoding: [0x03,0xff,0xa9,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, s105, v255 ; encoding: [0x69,0xfe,0xa9,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, s1, v255 ; encoding: [0x01,0xfe,0xa9,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xa9,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xa9,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xa9,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, m0, v255 ; encoding: 
[0x7d,0xfe,0xa9,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xa9,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xa9,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, null, v255 ; encoding: [0xff,0xfe,0xa9,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xa9,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xa9,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, src_scc, v4 ; encoding: 
[0xc1,0xfe,0xa9,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_sub_nc_u32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_sub_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xa8,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa8,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa8,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa8,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa8,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa8,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa8,0xca,0x01,0x06,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa8,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa8,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa8,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa8,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa8,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa8,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa8,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa8,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa8,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa8,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe8,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe8,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 
v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe8,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe8,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe8,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe8,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe8,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe8,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe8,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, 
v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe8,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe8,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe8,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe8,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe8,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe8,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; 
encoding: [0xc1,0x08,0xe8,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_sub_nc_u32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_sub_nc_u32 v6, v1, v255 ; encoding: [0x04,0x01,0x28,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_sub_nc_u32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_sub_nc_u32 v6, v255, v255 ; encoding: [0x01,0x01,0x28,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_sub_nc_u32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_sub_nc_u32 v6, v2, v255 ; encoding: [0xff,0x01,0x28,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_sub_nc_u32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_sub_nc_u32 v6, v3, v255 ; encoding: [0x02,0x01,0x28,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_sub_nc_u32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_sub_nc_u32 v6, v4, v255 ; encoding: [0x03,0x01,0x28,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_sub_nc_u32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_sub_nc_u32 v6, s1, v255 ; encoding: [0x69,0x00,0x28,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: 
v_dual_sub_nc_u32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_sub_nc_u32 v6, s105, v255 ; encoding: [0x01,0x00,0x28,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_nc_u32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_nc_u32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x28,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_sub_nc_u32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_nc_u32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x28,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_sub_nc_u32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_nc_u32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x28,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_sub_nc_u32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_sub_nc_u32 v6, m0, v255 ; encoding: [0x7d,0x00,0x28,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_nc_u32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_nc_u32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x28,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_nc_u32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_nc_u32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x28,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_sub_nc_u32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_sub_nc_u32 v6, null, v255 ; encoding: 
[0xff,0x00,0x28,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_sub_nc_u32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_sub_nc_u32 v6, -1, v255 ; encoding: [0xfd,0x00,0x28,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_sub_nc_u32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x28,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_sub_nc_u32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_sub_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x28,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_sub_nc_u32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_sub_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x28,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe8,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe8,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe8,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe8,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe8,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe8,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe8,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe8,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe8,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe8,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe8,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe8,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe8,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe8,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe8,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe8,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe8,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe8,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe8,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe8,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe8,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe8,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// 
GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe8,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe8,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe8,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe8,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe8,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe8,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe8,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: 
[0xff,0x04,0xe8,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe8,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe8,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe8,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe8,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x68,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x68,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x68,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x68,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x68,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x68,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x68,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x68,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x68,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x68,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, 
m0, v3 ; encoding: [0x7d,0x04,0x68,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x68,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x68,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x68,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x68,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x68,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x68,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x68,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa8,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa8,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa8,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa8,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa8,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa8,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa8,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 
+// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa8,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa8,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa8,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa8,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa8,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa8,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 
v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa8,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa8,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa8,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x2a,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x2a,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x2a,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x2a,0xc9,0x03,0x07,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x2a,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x2a,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x2a,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2a,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2a,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2a,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2a,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, 
exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2a,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2a,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x2a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2a,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2a,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2a,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, 
v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x6a,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x6a,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x6a,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x6a,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x6a,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x6a,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x6a,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6a,0xca,0x7b,0x06,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6a,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6a,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6a,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6a,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6a,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x6a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6a,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6a,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6a,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x6a,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x6a,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x6a,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, 
v3, v3 ; encoding: [0x02,0x05,0x6a,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x6a,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x6a,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x6a,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6a,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6a,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6a,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6a,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6a,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6a,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x6a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6a,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6a,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 
0xaf123456 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6a,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x2a,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x2a,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x2a,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x2a,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x2a,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// 
GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x2a,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x2a,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2a,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2a,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2a,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2a,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2a,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; 
encoding: [0x6a,0x04,0x2a,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x2a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2a,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2a,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2a,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v1, v255 ; encoding: [0x04,0xff,0xab,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: 
v_dual_lshrrev_b32 v6, v255, v255 ; encoding: [0x01,0xff,0xab,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v2, v255 ; encoding: [0xff,0xff,0xab,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v3, v255 ; encoding: [0x02,0xff,0xab,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v4, v255 ; encoding: [0x03,0xff,0xab,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, s105, v255 ; encoding: [0x69,0xfe,0xab,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, s1, v255 ; encoding: [0x01,0xfe,0xab,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, ttmp15, v255 ; encoding: 
[0x7b,0xfe,0xab,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xab,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xab,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xab,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xab,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xab,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, null, v255 ; encoding: 
[0xff,0xfe,0xab,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xab,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xab,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xab,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_lshrrev_b32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_lshrrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xaa,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xaa,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xaa,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: 
v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xaa,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xaa,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xaa,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xaa,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xaa,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xaa,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xaa,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 
:: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xaa,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xaa,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xaa,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xaa,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xaa,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xaa,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xaa,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, 
src_scc, v5 ; encoding: [0xc1,0x08,0xaa,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xaa,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xea,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xea,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xea,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xea,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xea,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xea,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xea,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xea,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xea,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xea,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xea,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xea,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xea,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xea,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xea,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xea,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xea,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xea,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_lshrrev_b32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_lshrrev_b32 v6, v1, v255 ; encoding: [0x04,0x01,0x2a,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_lshrrev_b32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_lshrrev_b32 v6, v255, v255 ; encoding: [0x01,0x01,0x2a,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 
v255 :: v_dual_lshrrev_b32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_lshrrev_b32 v6, v2, v255 ; encoding: [0xff,0x01,0x2a,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_lshrrev_b32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_lshrrev_b32 v6, v3, v255 ; encoding: [0x02,0x01,0x2a,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_lshrrev_b32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: v_dual_lshrrev_b32 v6, v4, v255 ; encoding: [0x03,0x01,0x2a,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_lshrrev_b32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_lshrrev_b32 v6, s1, v255 ; encoding: [0x69,0x00,0x2a,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_lshrrev_b32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_lshrrev_b32 v6, s105, v255 ; encoding: [0x01,0x00,0x2a,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_lshrrev_b32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshrrev_b32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x2a,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_lshrrev_b32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_lshrrev_b32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x2a,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_lshrrev_b32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_lshrrev_b32 v6, ttmp15, v255 ; encoding: 
[0x7e,0x00,0x2a,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_lshrrev_b32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_lshrrev_b32 v6, m0, v255 ; encoding: [0x7d,0x00,0x2a,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_lshrrev_b32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshrrev_b32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x2a,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_lshrrev_b32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshrrev_b32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x2a,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_lshrrev_b32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_lshrrev_b32 v6, null, v255 ; encoding: [0xff,0x00,0x2a,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_lshrrev_b32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_lshrrev_b32 v6, -1, v255 ; encoding: [0xfd,0x00,0x2a,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_lshrrev_b32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_lshrrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x2a,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_lshrrev_b32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_lshrrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x2a,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v6, null :: v_dual_lshrrev_b32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_lshrrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x2a,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xea,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xea,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xea,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xea,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xea,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xea,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, 
v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xea,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xea,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xea,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xea,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xea,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xea,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xea,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xea,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xea,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xea,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xea,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xea,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xea,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xea,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xea,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xea,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xea,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xea,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xea,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xea,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xea,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: 
v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xea,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xea,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xea,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xea,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xea,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xea,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xea,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: 
[0xc1,0x08,0xea,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xea,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x6a,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x6a,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x6a,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x6a,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x6a,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x6a,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 
s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x6a,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x6a,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x6a,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x6a,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6a,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x6a,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x6a,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, 
v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x6a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6a,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6a,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6a,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xaa,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xaa,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xaa,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xaa,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xaa,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xaa,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xaa,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xaa,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xaa,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xaa,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xaa,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xaa,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xaa,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xaa,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xaa,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xaa,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xaa,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xaa,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_add_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2c,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_add_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2c,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_add_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2c,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_add_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2c,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_add_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2c,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_add_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2c,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_add_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, 
s105, v3 ; encoding: [0x01,0x04,0x2c,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2c,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2c,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2c,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_add_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2c,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2c,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2c,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: 
[0xff,0x04,0x2c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_add_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2c,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_add_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2c,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_add_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2c,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_add_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6c,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6c,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6c,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6c,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6c,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6c,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6c,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6c,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6c,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6c,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 
v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6c,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6c,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6c,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6c,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6c,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6c,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v6, null, v5 :: v_dual_ashrrev_i32 
v255, 0xaf123456, v4 +// GFX12: v_dual_cndmask_b32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6c,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6c,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6c,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6c,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6c,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: 
[0x69,0x04,0x6c,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6c,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6c,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6c,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6c,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6c,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6c,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6c,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6c,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6c,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6c,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 
v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_fmac_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2c,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_fmac_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2c,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_fmac_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2c,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_fmac_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2c,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_fmac_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2c,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_fmac_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2c,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_fmac_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x2c,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: 
[0x7b,0x04,0x2c,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2c,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2c,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_fmac_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2c,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2c,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2c,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2c,0xc8,0xc1,0x06,0x06,0xff] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2c,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_fmac_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2c,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_fmac_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v1, v255 +// GFX12: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v1, v255 ; encoding: [0x04,0xff,0xad,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v255, v255 +// GFX12: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v255, v255 ; encoding: [0x01,0xff,0xad,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v2, v255 +// GFX12: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v2, v255 ; encoding: [0xff,0xff,0xad,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v3, v255 +// GFX12: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: 
v_dual_ashrrev_i32 v6, v3, v255 ; encoding: [0x02,0xff,0xad,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v4, v255 +// GFX12: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v4, v255 ; encoding: [0x03,0xff,0xad,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, s105, v255 +// GFX12: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, s105, v255 ; encoding: [0x69,0xfe,0xad,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, s1, v255 +// GFX12: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, s1, v255 ; encoding: [0x01,0xfe,0xad,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, ttmp15, v255 +// GFX12: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xad,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, exec_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xad,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, exec_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, exec_lo, v255 ; encoding: 
[0x7e,0xfe,0xad,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, m0, v255 +// GFX12: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xad,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, vcc_hi, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xad,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, vcc_lo, v255 +// GFX12: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xad,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, null, v255 +// GFX12: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, null, v255 ; encoding: [0xff,0xfe,0xad,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, -1, v255 +// GFX12: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xad,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, 0.5, v3 +// GFX12: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, 0.5, v3 ; encoding: 
[0xf0,0xfe,0xad,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, src_scc, v4 +// GFX12: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xad,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_ashrrev_i32 v255, 0xaf123456, v5 +// GFX12: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_ashrrev_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xac,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_max_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xac,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_max_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xac,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_max_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xac,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_max_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xac,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_max_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 
; encoding: [0x03,0x05,0xac,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_max_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xac,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_max_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xac,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xac,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xac,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xac,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_max_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xac,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: 
[0x6b,0x04,0xac,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xac,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xac,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xac,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xac,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_max_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xac,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_max_num_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xac,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_min_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: 
[0x04,0x05,0xec,0xca,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_min_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xec,0xca,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_min_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xec,0xca,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_min_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xec,0xca,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_min_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xec,0xca,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_min_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xec,0xca,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_min_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xec,0xca,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xec,0xca,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xec,0xca,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xec,0xca,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_min_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xec,0xca,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xec,0xca,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xec,0xca,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xec,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xec,0xca,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xec,0xca,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_min_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xec,0xca,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_min_num_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xec,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_ashrrev_i32 v6, v1, v255 +// GFX12: v_dual_mov_b32 v255, v4 :: v_dual_ashrrev_i32 v6, v1, v255 ; encoding: [0x04,0x01,0x2c,0xca,0x01,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_ashrrev_i32 v6, v255, v255 +// GFX12: v_dual_mov_b32 v255, v1 :: v_dual_ashrrev_i32 v6, v255, v255 ; encoding: [0x01,0x01,0x2c,0xca,0xff,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_ashrrev_i32 v6, v2, v255 +// GFX12: v_dual_mov_b32 v255, v255 :: v_dual_ashrrev_i32 v6, v2, v255 ; encoding: [0xff,0x01,0x2c,0xca,0x02,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_ashrrev_i32 v6, v3, v255 +// GFX12: v_dual_mov_b32 v255, v2 :: v_dual_ashrrev_i32 v6, v3, v255 ; encoding: [0x02,0x01,0x2c,0xca,0x03,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_ashrrev_i32 v6, v4, v255 +// GFX12: v_dual_mov_b32 v255, v3 :: 
v_dual_ashrrev_i32 v6, v4, v255 ; encoding: [0x03,0x01,0x2c,0xca,0x04,0xff,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_ashrrev_i32 v6, s1, v255 +// GFX12: v_dual_mov_b32 v255, s105 :: v_dual_ashrrev_i32 v6, s1, v255 ; encoding: [0x69,0x00,0x2c,0xca,0x01,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_ashrrev_i32 v6, s105, v255 +// GFX12: v_dual_mov_b32 v255, s1 :: v_dual_ashrrev_i32 v6, s105, v255 ; encoding: [0x01,0x00,0x2c,0xca,0x69,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_ashrrev_i32 v6, vcc_lo, v255 +// GFX12: v_dual_mov_b32 v255, ttmp15 :: v_dual_ashrrev_i32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x2c,0xca,0x6a,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_ashrrev_i32 v6, vcc_hi, v255 +// GFX12: v_dual_mov_b32 v255, exec_hi :: v_dual_ashrrev_i32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x2c,0xca,0x6b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_ashrrev_i32 v6, ttmp15, v255 +// GFX12: v_dual_mov_b32 v255, exec_lo :: v_dual_ashrrev_i32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x2c,0xca,0x7b,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_ashrrev_i32 v6, m0, v255 +// GFX12: v_dual_mov_b32 v255, m0 :: v_dual_ashrrev_i32 v6, m0, v255 ; encoding: [0x7d,0x00,0x2c,0xca,0x7d,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_ashrrev_i32 v6, exec_lo, v255 +// GFX12: v_dual_mov_b32 v255, vcc_hi :: v_dual_ashrrev_i32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x2c,0xca,0x7e,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_ashrrev_i32 v6, exec_hi, v255 +// GFX12: v_dual_mov_b32 v255, vcc_lo :: v_dual_ashrrev_i32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x2c,0xca,0x7f,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0xaf123456 :: v_dual_ashrrev_i32 v6, null, v255 +// GFX12: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_ashrrev_i32 v6, null, v255 ; encoding: [0xff,0x00,0x2c,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_ashrrev_i32 v6, -1, v255 +// GFX12: v_dual_mov_b32 v255, src_scc :: v_dual_ashrrev_i32 v6, -1, v255 ; encoding: [0xfd,0x00,0x2c,0xca,0xc1,0xfe,0x07,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_ashrrev_i32 v6, 0.5, v3 +// GFX12: v_dual_mov_b32 v255, 0.5 :: v_dual_ashrrev_i32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x2c,0xca,0xf0,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_ashrrev_i32 v6, src_scc, v4 +// GFX12: v_dual_mov_b32 v255, -1 :: v_dual_ashrrev_i32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x2c,0xca,0xfd,0x08,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v6, null :: v_dual_ashrrev_i32 v255, 0xaf123456, v5 +// GFX12: v_dual_mov_b32 v6, null :: v_dual_ashrrev_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x2c,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xec,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_ashrrev_i32 
v6, v255, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xec,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xec,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xec,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xec,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xec,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xec,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xec,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// 
GFX12: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xec,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xec,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xec,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xec,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xec,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xec,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xec,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xec,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xec,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xec,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_mul_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xec,0xc8,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_mul_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xec,0xc8,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_mul_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xec,0xc8,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_mul_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xec,0xc8,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// 
GFX12: v_dual_mul_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xec,0xc8,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_mul_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xec,0xc8,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_mul_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xec,0xc8,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xec,0xc8,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xec,0xc8,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xec,0xc8,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_mul_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xec,0xc8,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: 
[0x6b,0x04,0xec,0xc8,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xec,0xc8,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xec,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xec,0xc8,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xec,0xc8,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_mul_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xec,0xc8,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_mul_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xec,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_sub_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6c,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_sub_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6c,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_sub_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6c,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_sub_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6c,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_sub_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6c,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_sub_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x6c,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_sub_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x6c,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x6c,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_sub_f32 
v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x6c,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x6c,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_sub_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6c,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x6c,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x6c,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6c,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: 
[0xf0,0x06,0x6c,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_sub_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6c,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_sub_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 +// GFX12: v_dual_subrev_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xac,0xc9,0x01,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 +// GFX12: v_dual_subrev_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xac,0xc9,0xff,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 +// GFX12: v_dual_subrev_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xac,0xc9,0x02,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 +// GFX12: v_dual_subrev_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xac,0xc9,0x03,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 +// GFX12: v_dual_subrev_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xac,0xc9,0x04,0x07,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 +// GFX12: v_dual_subrev_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xac,0xc9,0x01,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 +// GFX12: v_dual_subrev_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xac,0xc9,0x69,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 +// GFX12: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xac,0xc9,0x6a,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 +// GFX12: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xac,0xc9,0x6b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 +// GFX12: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xac,0xc9,0x7b,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 +// GFX12: v_dual_subrev_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xac,0xc9,0x7d,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xac,0xc9,0x7e,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: 
v_dual_ashrrev_i32 v6, exec_hi, v3 +// GFX12: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xac,0xc9,0x7f,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 +// GFX12: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xac,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 +// GFX12: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xac,0xc9,0xc1,0x06,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 +// GFX12: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xac,0xc9,0xf0,0x04,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 +// GFX12: v_dual_subrev_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xac,0xc9,0xfd,0x0a,0x06,0xff] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 +// GFX12: v_dual_subrev_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xac,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vopd3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd3.s new file mode 100644 index 0000000000000..1b7699a579c9b --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd3.s @@ -0,0 +1,19064 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: 
--version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=W64-ERR --implicit-check-not=error: %s + +v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: 
[0x01,0x91,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 
v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: 
[0xf0,0x90,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: 
[0x69,0x00,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, 
exec_hi, v3 ; encoding: [0x6a,0x00,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; 
encoding: [0x02,0x11,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: 
v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: 
v_dual_add_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// 
GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 
-1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: 
v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_add_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x10,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_add_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x10,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_add_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x10,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_add_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x10,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_add_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x10,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_add_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x10,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_add_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x10,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_add_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x10,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_add_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x10,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_add_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x10,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_add_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x10,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, 
exec_lo +// GFX1250: v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x10,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x10,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_add_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x10,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// 
GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x70,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: 
[0xfd,0x30,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: 
[0x04,0x31,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 ; encoding: [0x04,0x21,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_add_f32 v7, s105, v3 +// GFX1250: 
v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x69,0x40,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x01,0x40,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x40,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x40,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x40,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: 
[0x6b,0x40,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x40,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 
v255, v2, vcc_lo :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x69,0x00,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x01,0x00,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7b,0x00,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_add_nc_u32 v7, exec_hi, v3 ; 
encoding: [0x7f,0x00,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x7e,0x00,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x6b,0x00,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x6a,0x00,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, 
vcc_lo :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo 
:: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v28, -v15, v15, s46 :: v_dual_cndmask_b32 v29, -v13, -v13, s46 +// GFX1250: v_dual_cndmask_b32 v28, -v15, v15, s46 :: v_dual_cndmask_b32 v29, -v13, -v13, s46 ; encoding: [0x0f,0x91,0x24,0xcf,0x0d,0x33,0x0f,0x2e,0x1c,0x0d,0x2e,0x1d] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_fmac_f32 
v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x69,0x00,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x01,0x00,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x00,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x00,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x00,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x00,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x00,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: 
v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: 
[0x02,0x11,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x69,0x10,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x01,0x10,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7b,0x10,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x7f,0x10,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x7e,0x10,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x6b,0x10,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x6a,0x10,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_max_num_f32 v7, v1, v3 +// 
GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x69,0xa0,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_max_num_f32 v7, s1, v3 ; encoding: 
[0x01,0xa0,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7b,0xa0,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x7f,0xa0,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x7e,0xa0,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0xa0,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0xa0,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_min_num_f32 v7, v3, v3 +// 
GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x69,0xb0,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x01,0xb0,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7b,0xb0,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x7f,0xb0,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: 
[0x7e,0xb0,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0xb0,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0xb0,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v255, vcc_lo :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_cndmask_b32 v255, v4, v255, vcc_lo :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x24,0xcf,0x01,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v255, vcc_lo :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_cndmask_b32 v255, v1, v255, vcc_lo :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x24,0xcf,0xff,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v255, vcc_lo :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_cndmask_b32 v255, v255, v255, vcc_lo :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x24,0xcf,0x02,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v255, vcc_lo :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v255, vcc_lo :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x24,0xcf,0x03,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v255, vcc_lo :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_cndmask_b32 v255, v3, v255, vcc_lo :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x24,0xcf,0x04,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v255, vcc_lo :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_cndmask_b32 v255, s105, v255, vcc_lo :: v_dual_mov_b32 v7, s105 ; encoding: [0x69,0x80,0x24,0xcf,0x69,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v255, vcc_lo :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_cndmask_b32 v255, s1, v255, vcc_lo :: v_dual_mov_b32 v7, s1 ; encoding: 
[0x01,0x80,0x24,0xcf,0x01,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v255, vcc_lo :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v255, vcc_lo :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7b,0x80,0x24,0xcf,0x7b,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v255, vcc_lo :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v255, vcc_lo :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x7f,0x80,0x24,0xcf,0x7f,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v255, vcc_lo :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v255, vcc_lo :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x7e,0x80,0x24,0xcf,0x7e,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v255, vcc_lo :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_cndmask_b32 v255, m0, v255, vcc_lo :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x24,0xcf,0x7d,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v255, vcc_lo :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v255, vcc_lo :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x6b,0x80,0x24,0xcf,0x6b,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v255, vcc_lo :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v255, vcc_lo :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x6a,0x80,0x24,0xcf,0x6a,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 
v255, src_scc, v255, vcc_lo :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v255, vcc_lo :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x24,0xcf,0xc1,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; 
encoding: [0x02,0x71,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x69,0x70,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x01,0x70,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x70,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x70,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: 
[0x7e,0x70,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x70,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x70,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x70,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x69,0x30,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_mul_f32 v7, s1, 
v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x01,0x30,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x30,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x30,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x30,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x30,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: 
[0x6a,0x30,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: 
v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x69,0x50,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x01,0x50,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x50,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x50,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: 
[0x7e,0x50,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x50,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x50,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 
v4, v2, vcc_lo :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x69,0x60,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_subrev_f32 v7, s1, v3 ; encoding: 
[0x01,0x60,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x60,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x60,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x60,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x60,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x60,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_bitop2_b32 v7, v1, v3 bitop3:1 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_bitop2_b32 v7, v1, v3 bitop3:1 ; encoding: [0x04,0x21,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x01,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_f32 
v7, v255, v3 ; encoding: [0x01,0x41,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x40,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; 
encoding: [0xc1,0x40,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: 
[0x01,0x00,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: 
v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmac_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, 
m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, v4, v2 :: v_dual_fmac_f32 v9, v1, v3 +// GFX1250: v_dual_fmac_f32 v7, v4, v2 :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x04,0x01,0x00,0xcf,0x01,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmac_f32 v7, v1, v2 :: v_dual_fmac_f32 v9, v255, v3 +// GFX1250: v_dual_fmac_f32 v7, v1, v2 :: v_dual_fmac_f32 v9, v255, v3 ; encoding: [0x01,0x01,0x00,0xcf,0xff,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, v255, v2 :: v_dual_fmac_f32 v9, v2, v3 +// GFX1250: v_dual_fmac_f32 v7, v255, v2 :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0xff,0x01,0x00,0xcf,0x02,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, v2, v2 :: v_dual_fmac_f32 v9, v3, v3 +// GFX1250: v_dual_fmac_f32 v7, v2, v2 :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x02,0x01,0x00,0xcf,0x03,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, v3, v2 :: v_dual_fmac_f32 v9, v4, v3 +// GFX1250: v_dual_fmac_f32 v7, v3, v2 :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x03,0x01,0x00,0xcf,0x04,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, s105, v2 :: v_dual_fmac_f32 v9, s1, v3 +// GFX1250: v_dual_fmac_f32 v7, s105, v2 :: v_dual_fmac_f32 v9, s1, v3 ; encoding: [0x69,0x00,0x00,0xcf,0x01,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, s1, v2 :: v_dual_fmac_f32 v9, s105, v3 +// GFX1250: v_dual_fmac_f32 v7, s1, v2 :: v_dual_fmac_f32 v9, s105, v3 ; encoding: [0x01,0x00,0x00,0xcf,0x69,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, ttmp15, v2 :: v_dual_fmac_f32 v9, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v7, ttmp15, v2 :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7b,0x00,0x00,0xcf,0x6a,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, exec_hi, v2 :: v_dual_fmac_f32 v9, 
vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v7, exec_hi, v2 :: v_dual_fmac_f32 v9, vcc_hi, v3 ; encoding: [0x7f,0x00,0x00,0xcf,0x6b,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, exec_lo, v2 :: v_dual_fmac_f32 v9, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v7, exec_lo, v2 :: v_dual_fmac_f32 v9, ttmp15, v3 ; encoding: [0x7e,0x00,0x00,0xcf,0x7b,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, m0, v2 :: v_dual_fmac_f32 v9, m0, v3 +// GFX1250: v_dual_fmac_f32 v7, m0, v2 :: v_dual_fmac_f32 v9, m0, v3 ; encoding: [0x7d,0x00,0x00,0xcf,0x7d,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, vcc_hi, v2 :: v_dual_fmac_f32 v9, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v7, vcc_hi, v2 :: v_dual_fmac_f32 v9, exec_lo, v3 ; encoding: [0x6b,0x00,0x00,0xcf,0x7e,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, vcc_lo, v2 :: v_dual_fmac_f32 v9, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v7, vcc_lo, v2 :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x00,0xcf,0x7f,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, src_scc, v2 :: v_dual_fmac_f32 v9, -1, v3 +// GFX1250: v_dual_fmac_f32 v7, src_scc, v2 :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x00,0xcf,0xc1,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, 0.5, v3 :: v_dual_fmac_f32 v9, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v7, 0.5, v3 :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x00,0xcf,0xf0,0x00,0x03,0x00,0x07,0x02,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v7, -1, v4 :: v_dual_fmac_f32 v9, 
src_scc, v5 +// GFX1250: v_dual_fmac_f32 v7, -1, v4 :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x00,0xcf,0xfd,0x00,0x04,0x00,0x07,0x05,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, 
s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: 
[0x7d,0xa0,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 
:: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: 
v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 
v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_fmac_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x00,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_fmac_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x00,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_fmac_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x00,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x00,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_fmac_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x00,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_fmac_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x00,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: 
v_dual_fmac_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x00,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x00,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x00,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x00,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_fmac_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x00,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x00,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x00,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: 
v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x00,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, 
v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; 
encoding: [0x01,0x31,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x30,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; 
encoding: [0xc1,0x30,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: 
[0x01,0x50,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_f32 
v7, -1, v3 ; encoding: [0xfd,0x50,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: 
[0x03,0x61,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: 
v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:20 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x14 ; encoding: [0x04,0x21,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x14,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// 
GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x00,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, 
v2, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: 
[0x7f,0x90,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; 
encoding: [0x6a,0x00,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, 
v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 
v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0xa0,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: 
v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: 
[0x6b,0xb0,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_max_num_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x28,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_max_num_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x28,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_max_num_f32 v255, v255, v255 
:: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x28,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x28,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_max_num_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x28,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_max_num_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x28,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_max_num_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x28,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x28,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x28,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v255 :: 
v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x28,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_max_num_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x28,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x28,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x28,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x28,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, 
v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, 
src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, 
v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 
vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x50,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_subrev_f32 
v7, s105, v3 ; encoding: [0x01,0x60,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 
src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x6e +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x6e ; encoding: [0x04,0x21,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6e,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x40,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 
v7, s105, v3 ; encoding: [0x01,0x00,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 
src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: 
[0x02,0x91,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: 
v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// 
GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 
v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_num_f32 
v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0xa0,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: 
v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_min_num_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x2c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_min_num_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x2c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_min_num_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x2c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x2c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_min_num_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x2c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_min_num_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x2c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_min_num_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x2c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x2c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x2c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x2c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_min_num_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x2c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x2c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x2c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x2c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, 
exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: 
[0x69,0x30,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 
v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: 
v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: 
v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; 
encoding: [0xf0,0x60,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:255 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0xff ; encoding: [0x04,0x21,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0xff,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_add_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_add_f32 v7, v1, v255 ; encoding: [0x04,0x41,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_add_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_add_f32 v7, v255, v255 ; encoding: [0x01,0x41,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_add_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_add_f32 v7, v2, v255 ; encoding: [0xff,0x41,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_add_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_add_f32 v7, v3, v255 ; encoding: 
[0x02,0x41,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_add_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_add_f32 v7, v4, v255 ; encoding: [0x03,0x41,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_add_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_add_f32 v7, s1, v255 ; encoding: [0x69,0x40,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_add_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_add_f32 v7, s105, v255 ; encoding: [0x01,0x40,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_add_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x40,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_add_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_add_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x40,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_add_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_add_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x40,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_add_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_add_f32 v7, m0, v255 ; encoding: 
[0x7d,0x40,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_add_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x40,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_add_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x40,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_add_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_add_f32 v7, -1, v255 ; encoding: [0xfd,0x40,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_add_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_add_f32 v7, 0.5, v3 ; encoding: [0xf0,0x40,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_add_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_add_f32 v7, src_scc, v4 ; encoding: [0xc1,0x40,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_add_nc_u32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_add_nc_u32 v7, v1, v255 ; encoding: [0x04,0x01,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_add_nc_u32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_add_nc_u32 v7, v255, v255 ; encoding: 
[0x01,0x01,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_add_nc_u32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_add_nc_u32 v7, v2, v255 ; encoding: [0xff,0x01,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_add_nc_u32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_add_nc_u32 v7, v3, v255 ; encoding: [0x02,0x01,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_add_nc_u32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_add_nc_u32 v7, v4, v255 ; encoding: [0x03,0x01,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_add_nc_u32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_add_nc_u32 v7, s1, v255 ; encoding: [0x69,0x00,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_add_nc_u32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_add_nc_u32 v7, s105, v255 ; encoding: [0x01,0x00,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_add_nc_u32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_nc_u32 v7, vcc_lo, v255 ; encoding: [0x7b,0x00,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_add_nc_u32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_add_nc_u32 v7, vcc_hi, v255 ; encoding: 
[0x7f,0x00,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_add_nc_u32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_add_nc_u32 v7, ttmp15, v255 ; encoding: [0x7e,0x00,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_add_nc_u32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_add_nc_u32 v7, m0, v255 ; encoding: [0x7d,0x00,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_add_nc_u32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_nc_u32 v7, exec_lo, v255 ; encoding: [0x6b,0x00,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_add_nc_u32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_nc_u32 v7, exec_hi, v255 ; encoding: [0x6a,0x00,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_add_nc_u32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_add_nc_u32 v7, -1, v255 ; encoding: [0xfd,0x00,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_add_nc_u32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_add_nc_u32 v7, 0.5, v3 ; encoding: [0xf0,0x00,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_add_nc_u32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_add_nc_u32 v7, src_scc, v4 ; encoding: 
[0xc1,0x00,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v7, v1, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v7, v1, v255, vcc_lo ; encoding: [0x04,0x91,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_cndmask_b32 v7, v255, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_cndmask_b32 v7, v255, v255, vcc_lo ; encoding: [0x01,0x91,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_cndmask_b32 v7, v2, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_cndmask_b32 v7, v2, v255, vcc_lo ; encoding: [0xff,0x91,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_cndmask_b32 v7, v3, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_cndmask_b32 v7, v3, v255, vcc_lo ; encoding: [0x02,0x91,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_cndmask_b32 v7, v4, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_cndmask_b32 v7, v4, v255, vcc_lo ; encoding: [0x03,0x91,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_cndmask_b32 v7, s105, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_cndmask_b32 v7, s105, v255, vcc_lo ; encoding: [0x69,0x90,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_cndmask_b32 v7, s1, v255, vcc_lo +// GFX1250: 
v_dual_mov_b32 v255, s1 :: v_dual_cndmask_b32 v7, s1, v255, vcc_lo ; encoding: [0x01,0x90,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_cndmask_b32 v7, ttmp15, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_cndmask_b32 v7, ttmp15, v255, vcc_lo ; encoding: [0x7b,0x90,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_cndmask_b32 v7, exec_hi, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_cndmask_b32 v7, exec_hi, v255, vcc_lo ; encoding: [0x7f,0x90,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_cndmask_b32 v7, exec_lo, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_cndmask_b32 v7, exec_lo, v255, vcc_lo ; encoding: [0x7e,0x90,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_cndmask_b32 v7, m0, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_cndmask_b32 v7, m0, v255, vcc_lo ; encoding: [0x7d,0x90,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_cndmask_b32 v7, vcc_hi, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_cndmask_b32 v7, vcc_hi, v255, vcc_lo ; encoding: [0x6b,0x90,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_cndmask_b32 v7, vcc_lo, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_cndmask_b32 v7, vcc_lo, v255, vcc_lo ; encoding: [0x6a,0x90,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_cndmask_b32 v7, -1, v255, vcc_lo +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_cndmask_b32 v7, -1, v255, vcc_lo ; encoding: [0xfd,0x90,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_cndmask_b32 v7, 0.5, v3, vcc_lo +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_cndmask_b32 v7, 0.5, v3, vcc_lo ; encoding: [0xf0,0x90,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_cndmask_b32 v7, src_scc, v4, vcc_lo +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_cndmask_b32 v7, src_scc, v4, vcc_lo ; encoding: [0xc1,0x90,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_fmac_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fmac_f32 v7, v1, v255 ; encoding: [0x04,0x01,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_fmac_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_fmac_f32 v7, v255, v255 ; encoding: [0x01,0x01,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_fmac_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_fmac_f32 v7, v2, v255 ; encoding: [0xff,0x01,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_fmac_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_fmac_f32 v7, v3, v255 ; encoding: [0x02,0x01,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_fmac_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_fmac_f32 v7, v4, v255 ; encoding: [0x03,0x01,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_fmac_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_fmac_f32 v7, s1, v255 ; encoding: [0x69,0x00,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_fmac_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_fmac_f32 v7, s105, v255 ; encoding: [0x01,0x00,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_fmac_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmac_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x00,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_fmac_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_fmac_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x00,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_fmac_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_fmac_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x00,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_fmac_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_fmac_f32 v7, m0, v255 ; encoding: [0x7d,0x00,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_fmac_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmac_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x00,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_fmac_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmac_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x00,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_fmac_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_fmac_f32 v7, -1, v255 ; encoding: [0xfd,0x00,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_fmac_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_fmac_f32 v7, 0.5, v3 ; encoding: [0xf0,0x00,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_fmac_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_fmac_f32 v7, src_scc, v4 ; encoding: [0xc1,0x00,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_lshlrev_b32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_lshlrev_b32 v7, v1, v255 ; encoding: [0x04,0x11,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_lshlrev_b32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_lshlrev_b32 v7, v255, v255 ; encoding: [0x01,0x11,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, v255 :: v_dual_lshlrev_b32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_lshlrev_b32 v7, v2, v255 ; encoding: [0xff,0x11,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_lshlrev_b32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_lshlrev_b32 v7, v3, v255 ; encoding: [0x02,0x11,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_lshlrev_b32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_lshlrev_b32 v7, v4, v255 ; encoding: [0x03,0x11,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_lshlrev_b32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_lshlrev_b32 v7, s1, v255 ; encoding: [0x69,0x10,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_lshlrev_b32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_lshlrev_b32 v7, s105, v255 ; encoding: [0x01,0x10,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_lshlrev_b32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshlrev_b32 v7, vcc_lo, v255 ; encoding: [0x7b,0x10,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_lshlrev_b32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_lshlrev_b32 v7, vcc_hi, v255 ; encoding: [0x7f,0x10,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, exec_lo :: v_dual_lshlrev_b32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_lshlrev_b32 v7, ttmp15, v255 ; encoding: [0x7e,0x10,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_lshlrev_b32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_lshlrev_b32 v7, m0, v255 ; encoding: [0x7d,0x10,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_lshlrev_b32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshlrev_b32 v7, exec_lo, v255 ; encoding: [0x6b,0x10,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_lshlrev_b32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshlrev_b32 v7, exec_hi, v255 ; encoding: [0x6a,0x10,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_lshlrev_b32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_lshlrev_b32 v7, -1, v255 ; encoding: [0xfd,0x10,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_lshlrev_b32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_lshlrev_b32 v7, 0.5, v3 ; encoding: [0xf0,0x10,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_lshlrev_b32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_lshlrev_b32 v7, src_scc, v4 ; encoding: [0xc1,0x10,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_max_num_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_max_num_f32 v7, v1, v255 ; encoding: [0x04,0xa1,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_max_num_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_max_num_f32 v7, v255, v255 ; encoding: [0x01,0xa1,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_max_num_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_max_num_f32 v7, v2, v255 ; encoding: [0xff,0xa1,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_max_num_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_max_num_f32 v7, v3, v255 ; encoding: [0x02,0xa1,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_max_num_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_max_num_f32 v7, v4, v255 ; encoding: [0x03,0xa1,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_max_num_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_max_num_f32 v7, s1, v255 ; encoding: [0x69,0xa0,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_max_num_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_max_num_f32 v7, s105, v255 ; encoding: [0x01,0xa0,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 
:: v_dual_max_num_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_num_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0xa0,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_max_num_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_max_num_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0xa0,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_max_num_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_max_num_f32 v7, ttmp15, v255 ; encoding: [0x7e,0xa0,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_max_num_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_max_num_f32 v7, m0, v255 ; encoding: [0x7d,0xa0,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_max_num_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_num_f32 v7, exec_lo, v255 ; encoding: [0x6b,0xa0,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_max_num_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_num_f32 v7, exec_hi, v255 ; encoding: [0x6a,0xa0,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_max_num_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_max_num_f32 v7, -1, v255 ; encoding: [0xfd,0xa0,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_max_num_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_max_num_f32 v7, 0.5, v3 ; encoding: [0xf0,0xa0,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_max_num_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_max_num_f32 v7, src_scc, v4 ; encoding: [0xc1,0xa0,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_min_num_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_min_num_f32 v7, v1, v255 ; encoding: [0x04,0xb1,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_min_num_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_min_num_f32 v7, v255, v255 ; encoding: [0x01,0xb1,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_min_num_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_min_num_f32 v7, v2, v255 ; encoding: [0xff,0xb1,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_min_num_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_min_num_f32 v7, v3, v255 ; encoding: [0x02,0xb1,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_min_num_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_min_num_f32 v7, v4, v255 ; encoding: [0x03,0xb1,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: 
v_dual_min_num_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_min_num_f32 v7, s1, v255 ; encoding: [0x69,0xb0,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_min_num_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_min_num_f32 v7, s105, v255 ; encoding: [0x01,0xb0,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_min_num_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_num_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0xb0,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_min_num_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_min_num_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0xb0,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_min_num_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_min_num_f32 v7, ttmp15, v255 ; encoding: [0x7e,0xb0,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_min_num_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_min_num_f32 v7, m0, v255 ; encoding: [0x7d,0xb0,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_min_num_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_num_f32 v7, exec_lo, v255 ; encoding: [0x6b,0xb0,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, vcc_lo :: v_dual_min_num_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_num_f32 v7, exec_hi, v255 ; encoding: [0x6a,0xb0,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_min_num_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_min_num_f32 v7, -1, v255 ; encoding: [0xfd,0xb0,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_min_num_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_min_num_f32 v7, 0.5, v3 ; encoding: [0xf0,0xb0,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_min_num_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_min_num_f32 v7, src_scc, v4 ; encoding: [0xc1,0xb0,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_mov_b32 
v255, v2 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v25, v8 :: v_dual_mov_b32 v13, v16 +// GFX1250: v_dual_mov_b32 v25, v8 :: v_dual_mov_b32 v13, v16 ; encoding: [0x08,0x81,0x20,0xcf,0x10,0x01,0x00,0x00,0x19,0x00,0x00,0x0d] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_mul_dx9_zero_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mul_dx9_zero_f32 v7, v1, v255 ; encoding: [0x04,0x71,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_mul_dx9_zero_f32 v7, v255, v255 
+// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mul_dx9_zero_f32 v7, v255, v255 ; encoding: [0x01,0x71,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_mul_dx9_zero_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mul_dx9_zero_f32 v7, v2, v255 ; encoding: [0xff,0x71,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v255 ; encoding: [0x02,0x71,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_mul_dx9_zero_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mul_dx9_zero_f32 v7, v4, v255 ; encoding: [0x03,0x71,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_mul_dx9_zero_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mul_dx9_zero_f32 v7, s1, v255 ; encoding: [0x69,0x70,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_mul_dx9_zero_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mul_dx9_zero_f32 v7, s105, v255 ; encoding: [0x01,0x70,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x70,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 
exec_hi :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x70,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x70,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_mul_dx9_zero_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mul_dx9_zero_f32 v7, m0, v255 ; encoding: [0x7d,0x70,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x70,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x70,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_mul_dx9_zero_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mul_dx9_zero_f32 v7, -1, v255 ; encoding: [0xfd,0x70,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v3 ; encoding: [0xf0,0x70,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v4 ; encoding: [0xc1,0x70,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_mul_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mul_f32 v7, v1, v255 ; encoding: [0x04,0x31,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_mul_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mul_f32 v7, v255, v255 ; encoding: [0x01,0x31,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_mul_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mul_f32 v7, v2, v255 ; encoding: [0xff,0x31,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_mul_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mul_f32 v7, v3, v255 ; encoding: [0x02,0x31,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_mul_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mul_f32 v7, v4, v255 ; encoding: [0x03,0x31,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_mul_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mul_f32 v7, s1, v255 ; encoding: [0x69,0x30,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mov_b32 v255, s1 :: v_dual_mul_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mul_f32 v7, s105, v255 ; encoding: [0x01,0x30,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x30,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_mul_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x30,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_mul_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x30,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_mul_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mul_f32 v7, m0, v255 ; encoding: [0x7d,0x30,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x30,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x30,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 
src_scc :: v_dual_mul_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mul_f32 v7, -1, v255 ; encoding: [0xfd,0x30,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_mul_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_f32 v7, 0.5, v3 ; encoding: [0xf0,0x30,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_mul_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mul_f32 v7, src_scc, v4 ; encoding: [0xc1,0x30,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_sub_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_sub_f32 v7, v1, v255 ; encoding: [0x04,0x51,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_sub_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_sub_f32 v7, v255, v255 ; encoding: [0x01,0x51,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_sub_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_sub_f32 v7, v2, v255 ; encoding: [0xff,0x51,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_sub_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_sub_f32 v7, v3, v255 ; encoding: [0x02,0x51,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_sub_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_sub_f32 v7, 
v4, v255 ; encoding: [0x03,0x51,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_sub_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_sub_f32 v7, s1, v255 ; encoding: [0x69,0x50,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_sub_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_sub_f32 v7, s105, v255 ; encoding: [0x01,0x50,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x50,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_sub_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x50,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_sub_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x50,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_sub_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_sub_f32 v7, m0, v255 ; encoding: [0x7d,0x50,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_f32 v7, exec_lo, v255 ; encoding: 
[0x6b,0x50,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x50,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_sub_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_sub_f32 v7, -1, v255 ; encoding: [0xfd,0x50,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_sub_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_f32 v7, 0.5, v3 ; encoding: [0xf0,0x50,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_sub_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_sub_f32 v7, src_scc, v4 ; encoding: [0xc1,0x50,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_subrev_f32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_subrev_f32 v7, v1, v255 ; encoding: [0x04,0x61,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_subrev_f32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_subrev_f32 v7, v255, v255 ; encoding: [0x01,0x61,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_subrev_f32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_subrev_f32 v7, v2, v255 ; encoding: 
[0xff,0x61,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_subrev_f32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_subrev_f32 v7, v3, v255 ; encoding: [0x02,0x61,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_subrev_f32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_subrev_f32 v7, v4, v255 ; encoding: [0x03,0x61,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_subrev_f32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_subrev_f32 v7, s1, v255 ; encoding: [0x69,0x60,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_subrev_f32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_subrev_f32 v7, s105, v255 ; encoding: [0x01,0x60,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_subrev_f32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_subrev_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x60,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_subrev_f32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_subrev_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x60,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_subrev_f32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_subrev_f32 v7, ttmp15, v255 ; encoding: 
[0x7e,0x60,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_subrev_f32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_subrev_f32 v7, m0, v255 ; encoding: [0x7d,0x60,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_subrev_f32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_subrev_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x60,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_subrev_f32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_subrev_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x60,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_subrev_f32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_subrev_f32 v7, -1, v255 ; encoding: [0xfd,0x60,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_subrev_f32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_subrev_f32 v7, 0.5, v3 ; encoding: [0xf0,0x60,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_subrev_f32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_subrev_f32 v7, src_scc, v4 ; encoding: [0xc1,0x60,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: 
[0x04,0x31,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:254 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0xfe ; encoding: [0x04,0x21,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0x03,0xfe,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 
v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: 
[0xff,0x01,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 
:: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: 
[0xc1,0x00,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: 
v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; 
encoding: [0xc1,0x00,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: 
v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 
; encoding: [0x6a,0x10,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 
v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: 
[0x7e,0xa0,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: 
[0x01,0xb0,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x1c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x1c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x1c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: 
[0x02,0x81,0x1c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x1c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x1c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x1c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x1c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x1c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x1c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: 
v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x1c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x1c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x1c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x1c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: 
v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: 
[0x7d,0x30,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_f32 v7, 
v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; 
encoding: [0xf0,0x50,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_subrev_f32 
v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x60,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x11 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x11 ; encoding: [0x04,0x21,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x11,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 
v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// 
GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, 
s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, 
exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; 
encoding: [0x7e,0x90,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 
v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// 
GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: 
v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_lshlrev_b32 
v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: 
[0x04,0xb1,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0xb0,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 
0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_mul_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x0c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_mul_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x0c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_mul_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x0c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x0c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_mul_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x0c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_mul_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: 
[0x69,0x80,0x0c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_mul_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x0c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x0c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x0c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x0c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_mul_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x0c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x0c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: 
[0x6a,0x80,0x0c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_mul_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x0c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: 
[0x02,0x71,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, 
m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 
:: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: 
v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, 
v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 
v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_subrev_f32 
v7, v4, v3 ; encoding: [0x03,0x61,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, 
vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x71 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x71 ; encoding: [0x04,0x21,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x71,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// 
GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, 
vcc_lo, v3 ; encoding: [0x7b,0x40,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 
0.5, v2 ; encoding: [0xf0,0x40,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: 
[0x69,0x00,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: 
v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, 
v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: 
[0x7e,0x90,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 
:: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 
0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// 
GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 
vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_sub_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: 
[0xf0,0xb0,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_sub_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x14,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_sub_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x14,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_sub_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x14,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x14,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_sub_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x14,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_sub_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x14,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_sub_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x14,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x14,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x14,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x14,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_sub_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x14,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x14,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x14,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_sub_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x14,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: 
[0x7d,0x70,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: 
v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: 
v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: 
v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: 
[0x01,0x50,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; 
encoding: [0xfd,0x50,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: 
[0x03,0x61,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: 
v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x82 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x82 ; encoding: [0x04,0x21,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x82,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: 
v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 
v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: 
v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, 
v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; 
encoding: [0xff,0x91,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: 
v_dual_subrev_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: 
v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: 
[0x7d,0x10,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: 
v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 
:: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: 
[0x69,0xb0,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, 
v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_subrev_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x18,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_subrev_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x18,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_subrev_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x18,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v255 :: 
v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x18,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_subrev_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x18,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_subrev_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x18,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_subrev_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x18,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x18,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x18,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x18,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v255 :: v_dual_mov_b32 
v7, m0 +// GFX1250: v_dual_subrev_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x18,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x18,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x18,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x18,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: 
v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x70,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x50,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, 
v1, v3 ; encoding: [0x04,0x61,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 
:: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 
0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x83 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x83 ; encoding: [0x04,0x21,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x83,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 
v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: 
v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, 
vcc_lo :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x69,0x70,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x01,0x70,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: 
[0x7b,0x70,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x7f,0x70,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x7e,0x70,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x6b,0x70,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x6a,0x70,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: 
[0xf0,0x70,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_max_i32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_max_i32 v7, v1, v255 ; encoding: [0x04,0x71,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_max_i32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_max_i32 v7, v255, v255 ; encoding: [0x01,0x71,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_max_i32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_max_i32 v7, v2, v255 ; encoding: [0xff,0x71,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_max_i32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_max_i32 v7, v3, v255 ; encoding: [0x02,0x71,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_max_i32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_max_i32 v7, v4, v255 ; encoding: [0x03,0x71,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_max_i32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_max_i32 v7, s1, v255 ; encoding: [0x69,0x70,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_max_i32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_max_i32 v7, s105, v255 ; encoding: [0x01,0x70,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_max_i32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_i32 v7, vcc_lo, v255 ; encoding: [0x7b,0x70,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_max_i32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_max_i32 v7, vcc_hi, v255 ; encoding: [0x7f,0x70,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_max_i32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_max_i32 v7, ttmp15, v255 ; encoding: [0x7e,0x70,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_max_i32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_max_i32 v7, m0, v255 ; encoding: [0x7d,0x70,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_max_i32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_i32 v7, exec_lo, v255 ; encoding: [0x6b,0x70,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_max_i32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_i32 v7, exec_hi, v255 ; encoding: [0x6a,0x70,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_max_i32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_max_i32 v7, -1, v255 ; encoding: [0xfd,0x70,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_max_i32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_max_i32 v7, 0.5, v3 ; encoding: [0xf0,0x70,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_max_i32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_max_i32 v7, src_scc, v4 ; encoding: [0xc1,0x70,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: 
[0x7d,0x70,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: 
v_dual_mul_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: 
v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: 
v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: 
[0x01,0x70,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; 
encoding: [0xfd,0x70,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: 
[0x03,0x71,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: 
v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: 
v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; 
encoding: [0x7e,0x80,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_min_i32 v7, v1, v3 ; 
encoding: [0x04,0x81,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x69,0x80,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x01,0x80,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: 
v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7b,0x80,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x7f,0x80,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x7e,0x80,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x6b,0x80,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x6a,0x80,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_min_i32 v7, -1, v3 ; encoding: 
[0xfd,0x80,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: 
[0x03,0x81,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; 
encoding: [0x6b,0x80,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; 
encoding: [0xff,0x81,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: 
v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: 
v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_min_i32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_min_i32 v7, v1, v255 ; encoding: [0x04,0x81,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_min_i32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_min_i32 v7, v255, v255 ; encoding: [0x01,0x81,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_min_i32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_min_i32 v7, v2, v255 ; encoding: [0xff,0x81,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_min_i32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_min_i32 v7, v3, v255 ; encoding: [0x02,0x81,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_min_i32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_min_i32 v7, v4, v255 ; encoding: [0x03,0x81,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_min_i32 v7, s1, v255 +// GFX1250: 
v_dual_mov_b32 v255, s105 :: v_dual_min_i32 v7, s1, v255 ; encoding: [0x69,0x80,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_min_i32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_min_i32 v7, s105, v255 ; encoding: [0x01,0x80,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_min_i32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_i32 v7, vcc_lo, v255 ; encoding: [0x7b,0x80,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_min_i32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_min_i32 v7, vcc_hi, v255 ; encoding: [0x7f,0x80,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_min_i32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_min_i32 v7, ttmp15, v255 ; encoding: [0x7e,0x80,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_min_i32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_min_i32 v7, m0, v255 ; encoding: [0x7d,0x80,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_min_i32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_i32 v7, exec_lo, v255 ; encoding: [0x6b,0x80,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_min_i32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: 
v_dual_min_i32 v7, exec_hi, v255 ; encoding: [0x6a,0x80,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_min_i32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_min_i32 v7, -1, v255 ; encoding: [0xfd,0x80,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_min_i32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_min_i32 v7, 0.5, v3 ; encoding: [0xf0,0x80,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_min_i32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_min_i32 v7, src_scc, v4 ; encoding: [0xc1,0x80,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 
:: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: 
[0x04,0x81,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x80,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: 
[0xf0,0x80,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: 
[0x01,0x41,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x40,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: 
v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x69,0x40,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x01,0x40,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7b,0x40,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x7f,0x40,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x7e,0x40,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x6b,0x40,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x6a,0x40,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: 
[0x04,0x41,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: 
v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x40,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_sub_nc_u32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_sub_nc_u32 v7, v1, v255 ; encoding: [0x04,0x41,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_sub_nc_u32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_sub_nc_u32 v7, v255, v255 ; encoding: [0x01,0x41,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_sub_nc_u32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_sub_nc_u32 
v7, v2, v255 ; encoding: [0xff,0x41,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_sub_nc_u32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_sub_nc_u32 v7, v3, v255 ; encoding: [0x02,0x41,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_sub_nc_u32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_sub_nc_u32 v7, v4, v255 ; encoding: [0x03,0x41,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_sub_nc_u32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_sub_nc_u32 v7, s1, v255 ; encoding: [0x69,0x40,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_sub_nc_u32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_sub_nc_u32 v7, s105, v255 ; encoding: [0x01,0x40,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_nc_u32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_nc_u32 v7, vcc_lo, v255 ; encoding: [0x7b,0x40,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_sub_nc_u32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_nc_u32 v7, vcc_hi, v255 ; encoding: [0x7f,0x40,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_sub_nc_u32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_nc_u32 v7, ttmp15, v255 ; 
encoding: [0x7e,0x40,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_sub_nc_u32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_sub_nc_u32 v7, m0, v255 ; encoding: [0x7d,0x40,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_nc_u32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_nc_u32 v7, exec_lo, v255 ; encoding: [0x6b,0x40,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_nc_u32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_nc_u32 v7, exec_hi, v255 ; encoding: [0x6a,0x40,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_sub_nc_u32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_sub_nc_u32 v7, -1, v255 ; encoding: [0xfd,0x40,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_sub_nc_u32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_nc_u32 v7, 0.5, v3 ; encoding: [0xf0,0x40,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_sub_nc_u32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_sub_nc_u32 v7, src_scc, v4 ; encoding: [0xc1,0x40,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; 
encoding: [0x04,0x41,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: 
v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: 
[0xfd,0x40,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 
; encoding: [0x03,0x41,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: 
v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, 
v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 
v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 
+// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: 
v_dual_sub_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x50,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v2, v3 +// 
GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x69,0x50,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x01,0x50,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7b,0x50,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: 
[0x7f,0x50,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x7e,0x50,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x6b,0x50,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x6a,0x50,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; 
encoding: [0x6a,0x50,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, 
v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 
v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x50,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: 
v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_lshrrev_b32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_lshrrev_b32 v7, v1, v255 ; encoding: [0x04,0x51,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_lshrrev_b32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_lshrrev_b32 v7, v255, v255 ; encoding: [0x01,0x51,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_lshrrev_b32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_lshrrev_b32 v7, v2, v255 ; encoding: [0xff,0x51,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_lshrrev_b32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_lshrrev_b32 v7, v3, v255 ; encoding: [0x02,0x51,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_lshrrev_b32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_lshrrev_b32 v7, v4, v255 ; encoding: [0x03,0x51,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_lshrrev_b32 
v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_lshrrev_b32 v7, s1, v255 ; encoding: [0x69,0x50,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_lshrrev_b32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_lshrrev_b32 v7, s105, v255 ; encoding: [0x01,0x50,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, ttmp15 :: v_dual_lshrrev_b32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshrrev_b32 v7, vcc_lo, v255 ; encoding: [0x7b,0x50,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_lshrrev_b32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_lshrrev_b32 v7, vcc_hi, v255 ; encoding: [0x7f,0x50,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_lshrrev_b32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_lshrrev_b32 v7, ttmp15, v255 ; encoding: [0x7e,0x50,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_lshrrev_b32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_lshrrev_b32 v7, m0, v255 ; encoding: [0x7d,0x50,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_lshrrev_b32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshrrev_b32 v7, exec_lo, v255 ; encoding: [0x6b,0x50,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: 
v_dual_lshrrev_b32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshrrev_b32 v7, exec_hi, v255 ; encoding: [0x6a,0x50,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_lshrrev_b32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_lshrrev_b32 v7, -1, v255 ; encoding: [0xfd,0x50,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_lshrrev_b32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_lshrrev_b32 v7, 0.5, v3 ; encoding: [0xf0,0x50,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_lshrrev_b32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_lshrrev_b32 v7, src_scc, v4 ; encoding: [0xc1,0x50,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: 
v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: 
[0xfd,0x50,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: 
[0x03,0x51,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: 
v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 +// 
GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: 
[0xc1,0x50,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: 
[0x01,0x60,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: 
v_dual_add_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x69,0x60,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x01,0x60,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7b,0x60,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x7f,0x60,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x7e,0x60,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_ashrrev_i32 v7, m0, v3 +// 
GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x6b,0x60,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x6a,0x60,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: 
[0x04,0x61,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, 
v3 ; encoding: [0x7b,0x60,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// 
GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s105, v2 
:: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; 
encoding: [0xff,0x61,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// 
GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_ashrrev_i32 v7, v1, v255 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_ashrrev_i32 v7, v1, v255 ; encoding: [0x04,0x61,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v1 :: v_dual_ashrrev_i32 v7, v255, v255 +// GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_ashrrev_i32 v7, v255, v255 ; encoding: [0x01,0x61,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v255 :: v_dual_ashrrev_i32 v7, v2, v255 +// GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_ashrrev_i32 v7, v2, v255 ; encoding: [0xff,0x61,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v2 :: v_dual_ashrrev_i32 v7, v3, v255 +// GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_ashrrev_i32 v7, v3, v255 ; encoding: [0x02,0x61,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v3 :: v_dual_ashrrev_i32 v7, v4, v255 +// GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_ashrrev_i32 v7, v4, v255 ; encoding: [0x03,0x61,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s105 :: v_dual_ashrrev_i32 v7, s1, v255 +// GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_ashrrev_i32 v7, s1, v255 ; encoding: [0x69,0x60,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, s1 :: v_dual_ashrrev_i32 v7, s105, v255 +// GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_ashrrev_i32 v7, s105, v255 ; encoding: [0x01,0x60,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, 
ttmp15 :: v_dual_ashrrev_i32 v7, vcc_lo, v255 +// GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_ashrrev_i32 v7, vcc_lo, v255 ; encoding: [0x7b,0x60,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_hi :: v_dual_ashrrev_i32 v7, vcc_hi, v255 +// GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_ashrrev_i32 v7, vcc_hi, v255 ; encoding: [0x7f,0x60,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, exec_lo :: v_dual_ashrrev_i32 v7, ttmp15, v255 +// GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_ashrrev_i32 v7, ttmp15, v255 ; encoding: [0x7e,0x60,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, m0 :: v_dual_ashrrev_i32 v7, m0, v255 +// GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_ashrrev_i32 v7, m0, v255 ; encoding: [0x7d,0x60,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_hi :: v_dual_ashrrev_i32 v7, exec_lo, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_ashrrev_i32 v7, exec_lo, v255 ; encoding: [0x6b,0x60,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, vcc_lo :: v_dual_ashrrev_i32 v7, exec_hi, v255 +// GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_ashrrev_i32 v7, exec_hi, v255 ; encoding: [0x6a,0x60,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, src_scc :: v_dual_ashrrev_i32 v7, -1, v255 +// GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_ashrrev_i32 v7, -1, v255 ; encoding: [0xfd,0x60,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mov_b32 v255, 0.5 :: v_dual_ashrrev_i32 v7, 0.5, v3 +// GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_ashrrev_i32 v7, 0.5, v3 ; encoding: [0xf0,0x60,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, -1 :: v_dual_ashrrev_i32 v7, src_scc, v4 +// GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_ashrrev_i32 v7, src_scc, v4 ; encoding: [0xc1,0x60,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: 
[0x03,0x61,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: 
[0x01,0x61,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, 
v3 ; encoding: [0x7f,0x60,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f32 
v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_sub_f32 v255, s1, v2 
:: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, src_scc, v2 :: 
v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 +// GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 +// GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 +// GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_subrev_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 +// GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 +// GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 +// GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 +// GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 +// GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 +// GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 +// GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 +// GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 +// GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x40,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: 
[0x01,0x41,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x40,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: 
v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x41,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x41,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x41,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x41,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x41,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x41,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 
v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x41,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x41,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x41,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x41,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x41,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x41,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x41,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x41,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x41,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x41,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: 
[0x02,0x91,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, 
v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 
:: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_fmac_f32 
v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x41,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x41,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x41,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x41,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x41,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, 
v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x41,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x41,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x41,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x41,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x41,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x41,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x41,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x41,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x41,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x41,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x41,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: 
[0xff,0xa1,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: 
v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_nc_u32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; 
encoding: [0xfd,0xb0,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_add_nc_u32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x40,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_add_nc_u32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x40,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_add_nc_u32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x40,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x40,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_add_nc_u32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: 
[0x03,0x81,0x40,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_add_nc_u32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x40,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_add_nc_u32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x40,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x40,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x40,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x40,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_add_nc_u32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x40,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; 
encoding: [0x6b,0x80,0x40,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x40,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x40,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: 
v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: 
v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_mul_f32 v7, 
s105, v3 ; encoding: [0x01,0x30,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: 
v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 
v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// 
GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v255, v2 :: 
v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x60,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 +// GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 +// GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 +// GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x84 +// GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x84 ; encoding: [0x04,0x21,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x84,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x44,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: 
v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, 
exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_add_f32 v7, 
src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x45,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x45,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x45,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x45,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x45,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x45,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, 
v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x45,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x45,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x45,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x45,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x45,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x45,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x45,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x45,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x45,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x45,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 
:: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: 
[0x7e,0x90,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: 
[0xfd,0x00,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x45,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x45,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x45,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x45,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: 
v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x45,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x45,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x45,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x45,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x45,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x45,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x45,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 
vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x45,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x45,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x45,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x45,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x45,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; 
encoding: [0x7f,0xa0,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_max_num_f32 
v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: 
[0x6a,0xb0,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x44,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x44,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x44,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; 
encoding: [0x02,0x81,0x44,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x44,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x44,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x44,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x44,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x44,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x44,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; 
encoding: [0x7d,0x80,0x44,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x44,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x44,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x44,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: 
v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 
v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: 
v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 
vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x60,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 
0.5, v2 +// GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 +// GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 +// GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 +// GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x85 +// GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x85 ; encoding: [0x04,0x21,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x85,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x50,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 
ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x51,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x51,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x51,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x51,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x51,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x51,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x51,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x51,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x51,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x51,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x51,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x00,0x51,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x51,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x51,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x51,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x51,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo 
+// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: 
[0x7f,0x90,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: 
[0x6a,0x00,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x51,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x51,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x51,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_lshlrev_b32 
v7, v3, v3 ; encoding: [0x02,0x11,0x51,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x51,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x51,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x51,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x51,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x51,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x51,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, 
v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x51,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x51,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x51,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x51,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x51,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x51,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 
0.5, v2 ; encoding: [0xf0,0xa0,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, 
s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x50,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x50,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x50,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x50,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x50,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x50,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x50,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x50,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x50,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x50,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x50,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x50,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x50,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x50,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x70,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x50,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_subrev_f32 v7, 
v1, v3 ; encoding: [0x04,0x61,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 
:: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, 
0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 +// GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 +// GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 +// GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x86 +// GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x86 ; encoding: [0x04,0x21,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x86,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x58,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: 
[0xfd,0x40,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x59,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x59,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x59,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x59,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_add_nc_u32 v7, 
v4, v3 ; encoding: [0x03,0x01,0x59,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x59,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x59,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x59,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x59,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x59,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x59,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, 
exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x59,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x59,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x59,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x59,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x59,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo 
+// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; 
encoding: [0xf0,0x90,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: 
v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_fmac_f32 
v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x59,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x59,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x59,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x59,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x59,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x59,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x59,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x59,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x59,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x10,0x59,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x59,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x59,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x59,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x59,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x59,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x59,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// 
GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 
ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: 
[0xfd,0xa0,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 
:: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 
vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x58,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x58,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x58,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x58,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x58,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x58,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x58,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x58,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x58,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x58,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x58,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x58,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x58,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x58,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: 
v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: 
[0x02,0x31,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: 
v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: 
v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// 
GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 
v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: 
[0x6a,0x60,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 +// GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 +// GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 +// GFX1250: v_dual_ashrrev_i32 v254, v4, v2 
:: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x87 +// GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x87 ; encoding: [0x04,0x21,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x87,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x54,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 
v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: 
v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x55,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x55,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x55,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x55,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x55,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x55,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x55,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x55,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x55,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x55,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x55,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x55,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x55,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x55,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x55,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x55,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, 
v1, v3, vcc_lo ; encoding: [0x04,0x91,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, 
v3, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x55,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: 
[0x01,0x11,0x55,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x55,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x55,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x55,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x55,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x55,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x55,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: 
v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x55,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x55,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x55,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x55,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x55,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x55,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x55,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x55,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, 
v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: 
v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: 
[0x04,0x81,0x54,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x54,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x54,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x54,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x54,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x54,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x54,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: 
[0x7b,0x80,0x54,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x54,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x54,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x54,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x54,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x54,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x54,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: 
v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: 
v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x70,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, 
v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: 
v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: 
[0x03,0x61,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 +// 
GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x55,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 +// GFX1250: v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 +// GFX1250: v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 +// GFX1250: v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x88 +// GFX1250: v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x88 ; encoding: [0x04,0x21,0x55,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x88,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v254, v4, v2, v10 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v254, v4, v2, v10 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_add_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_add_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_add_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_add_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_add_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_add_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_add_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_add_f32 v7, m0, v3 ; encoding: 
[0x7d,0x40,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_add_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x4d,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_add_nc_u32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, 
v1, v2, v10 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x4d,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x4d,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x4d,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x4d,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_add_nc_u32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x4d,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_add_nc_u32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x4d,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x4d,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: 
v_dual_add_nc_u32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x4d,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_add_nc_u32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x4d,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_add_nc_u32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x4d,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_add_nc_u32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x4d,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x4d,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x4d,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x4d,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x4d,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo +// 
GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_cndmask_b32 v7, vcc_hi, v3, 
vcc_lo ; encoding: [0x6b,0x90,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_fmac_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_fmac_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_fmac_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_fmac_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_fmac_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_fmac_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_fmac_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_fmac_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_fmac_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_fmac_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_fmac_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_fmac_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_fmac_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_fmac_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_fmac_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x00,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x4d,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_lshlrev_b32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x4d,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x4d,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x4d,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x4d,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_lshlrev_b32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x4d,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_lshlrev_b32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, 
v2, v10 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x4d,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x4d,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x4d,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_lshlrev_b32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x4d,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_lshlrev_b32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x4d,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_lshlrev_b32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x4d,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x4d,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x4d,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x4d,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x4d,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_max_num_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: 
[0x02,0xa1,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_max_num_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_max_num_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_max_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_max_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_max_num_f32 v7, 
m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_max_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_min_num_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_min_num_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_min_num_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0xb0,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_min_num_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_min_num_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_min_num_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_min_num_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: 
v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v255, v10 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_fma_f32 v255, v4, v255, v10 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x4c,0xcf,0x01,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v255, v10 :: v_dual_mov_b32 v7, v255 +// GFX1250: v_dual_fma_f32 v255, v1, v255, v10 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x4c,0xcf,0xff,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v255, v10 :: v_dual_mov_b32 v7, v2 +// GFX1250: v_dual_fma_f32 v255, v255, v255, v10 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x4c,0xcf,0x02,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v255, v10 :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v255, v10 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x4c,0xcf,0x03,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v255, v10 :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_fma_f32 v255, v3, v255, v10 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x4c,0xcf,0x04,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v255, v10 
:: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_fma_f32 v255, s105, v255, v10 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x4c,0xcf,0x01,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v255, v10 :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_fma_f32 v255, s1, v255, v10 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x4c,0xcf,0x69,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v255, v10 :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_fma_f32 v255, ttmp15, v255, v10 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x4c,0xcf,0x6a,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v255, v10 :: v_dual_mov_b32 v7, vcc_hi +// GFX1250: v_dual_fma_f32 v255, exec_hi, v255, v10 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x4c,0xcf,0x6b,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v255, v10 :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v255, v10 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x4c,0xcf,0x7b,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v255, v10 :: v_dual_mov_b32 v7, m0 +// GFX1250: v_dual_fma_f32 v255, m0, v255, v10 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x4c,0xcf,0x7d,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v255, v10 :: v_dual_mov_b32 v7, exec_lo +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v255, v10 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x4c,0xcf,0x7e,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f32 v255, vcc_lo, v255, v10 :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v255, v10 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x4c,0xcf,0x7f,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v255, v10 :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_fma_f32 v255, src_scc, v255, v10 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x4c,0xcf,0xc1,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 +// 
GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x70,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_mul_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_mul_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_mul_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_mul_f32 v7, s105, v3 ; encoding: 
[0x01,0x30,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_mul_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_mul_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_mul_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_mul_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: 
v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_sub_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_sub_f32 v7, v4, v3 +// 
GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_sub_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_sub_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_sub_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_sub_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_sub_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 
vcc_hi, v2, v10 :: v_dual_sub_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_subrev_f32 v7, v255, v3 +// GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_subrev_f32 v7, s1, v3 +// GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_subrev_f32 v7, s105, v3 +// GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_subrev_f32 v7, vcc_hi, v3 +// GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x60,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_subrev_f32 v7, ttmp15, v3 +// GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_subrev_f32 v7, m0, v3 +// GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_subrev_f32 v7, exec_lo, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_subrev_f32 v7, 
src_scc, v5 +// GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_sub_nc_u32 v9, v1, v13 +// GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_lshrrev_b32 v9, v1, v13 +// GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_ashrrev_i32 v9, v1, v13 +// GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_bitop2_b32 v7, v1, v3 +// GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_bitop2_b32 v7, v1, v3 ; encoding: [0x04,0x21,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[252:253], v[6:7], v[4:5], v[10:11] :: v_dual_add_f32 v8, v1, v3 +// GFX1250: v_dual_fma_f64 v[252:253], v[6:7], v[4:5], v[10:11] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfc,0x03,0x00,0x08] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_add_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_add_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, 
v[4:5], v[10:11] :: v_dual_add_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x81,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v253, v3 +// GFX1250: 
v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x81,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x81,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x81,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x81,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x81,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x81,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v15, v3 +// GFX1250: 
v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x81,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x81,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x81,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x81,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x81,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo +// GFX1250: 
v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[2:3], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], s[2:3], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x80,0xcf,0x0a,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_fmac_f32 v9, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_fmac_f32 v9, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x81,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 
v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x81,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x81,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x81,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x81,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x81,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x81,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x81,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x81,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x81,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x81,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x81,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 
v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 
v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x80,0xcf,0x03,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 
v[2:3], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v253 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x80,0xcf,0xfd,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x80,0xcf,0x04,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x80,0xcf,0x03,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x80,0xcf,0x04,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[254:255], v[10:11] :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[254:255], v[10:11] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x80,0xcf,0x01,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[2:3], v[254:255], v[10:11] :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_fma_f64 v[254:255], s[2:3], v[254:255], v[10:11] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x80,0xcf,0x69,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[254:255], v[10:11] :: 
v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[254:255], v[10:11] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x80,0xcf,0x6a,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[254:255], v[10:11] :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[254:255], v[10:11] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x80,0xcf,0x7b,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[254:255], v[10:11] :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[254:255], v[10:11] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x80,0xcf,0x7f,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[254:255], v[10:11] :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[254:255], v[10:11] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x80,0xcf,0xc1,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 
v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: 
v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], 
v[10:11] :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_mul_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: 
v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], 
v[4:5], v[10:11] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_sub_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: 
[0x04,0x61,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v253, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v5, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: 
[0x7a,0x60,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v15, v3 +// GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_fma_f32 v9, v1, v14, v4 +// GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: 
[0x06,0x31,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x04,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_sub_nc_u32 v9, v1, v14 +// GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_lshrrev_b32 v9, v1, v14 +// GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_ashrrev_i32 v9, v1, v14 +// GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x91 +// GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x91 ; encoding: [0x06,0x21,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x03,0x91,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 +// GFX1250: v_dual_add_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x84,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 
v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x85,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x85,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x85,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x85,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x85,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x85,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x85,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x85,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x85,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 +// 
GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x85,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x85,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x85,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, 
v3, vcc_lo ; encoding: [0x04,0x91,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x84,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: 
[0xfd,0x90,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: 
v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x85,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x85,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x85,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x85,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x85,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x85,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x85,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x85,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x85,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x85,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x85,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 +// 
GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x85,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: 
[0x68,0xa0,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: 
v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: 
[0x04,0x81,0x84,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x84,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x84,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x84,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x84,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x84,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_add_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x84,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 
ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x84,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x84,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x84,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x84,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: 
[0x04,0x71,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: 
[0x7a,0x70,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, 
v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; 
encoding: [0x6a,0x50,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 
v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 +// GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: 
v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 +// GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 +// GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 +// GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 +// GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x92 +// GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x92 ; encoding: [0x06,0x21,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x92,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 +// GFX1250: v_dual_mul_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x88,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, 
v1, v3 ; encoding: [0x68,0x40,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 
v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x89,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x89,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x89,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x89,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x89,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x89,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: 
v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x89,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x89,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x89,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x89,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x89,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x89,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x88,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: 
v_dual_fmac_f32 v9, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: 
[0x7e,0x00,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x89,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x89,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 
v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x89,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x89,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x89,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x89,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x89,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x89,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: 
v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x89,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x89,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x89,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x89,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: 
v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: 
[0x04,0xb1,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x88,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x88,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x88,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x88,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[254:255] :: 
v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x88,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x88,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_mul_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x88,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x88,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x88,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x88,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x88,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 
v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 
0.5, v2 ; encoding: [0xf0,0x70,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; 
encoding: [0xc1,0x30,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 
ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: 
[0x04,0x61,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 +// GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 +// GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 +// GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 
v9, v1, v14 ; encoding: [0x06,0x41,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 +// GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 +// GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x93 +// GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x93 ; encoding: [0x06,0x21,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x93,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 +// GFX1250: v_dual_max_num_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 +// 
GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x8d,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x8d,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x8d,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: 
[0x04,0x01,0x8d,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x8d,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x8d,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x8d,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x8d,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x8d,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x8d,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x8d,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x8d,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x8c,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 
v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, 
v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x8d,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x8d,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x8d,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x8d,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: 
[0x06,0x11,0x8d,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x8d,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x8d,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x8d,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x8d,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x8d,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x8d,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x8d,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, 
src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: 
v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0xb0,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x8c,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x8c,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x8c,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x8c,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x8c,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x8c,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_max_num_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_max_num_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x8c,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x8c,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x8c,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x8c,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x8c,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_max_num_f64 
v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: 
v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: 
v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: 
v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, 
vcc_lo, v3 ; encoding: [0x7a,0x50,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], exec, v[4:5] :: 
v_dual_subrev_f32 v7, v15, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 +// GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 +// GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, 
v14 ; encoding: [0x06,0x41,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 +// GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 +// GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x94 +// GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x94 ; encoding: [0x06,0x21,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x94,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 +// GFX1250: v_dual_min_num_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x90,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: 
error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, 
v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x91,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x91,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x91,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: 
[0x04,0x01,0x91,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x91,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x91,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x91,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x91,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x91,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x91,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x91,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x91,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: 
instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x90,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +// W64-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 
v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, 
v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x91,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x91,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x91,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x91,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: 
[0x06,0x11,0x91,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x91,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x91,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x91,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x91,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x91,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x91,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x91,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, 
src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: 
v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0xb0,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x90,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x90,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x90,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x90,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x90,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x90,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_min_num_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 +// GFX1250: v_dual_min_num_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x90,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x90,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x90,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x90,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x90,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc +// GFX1250: v_dual_min_num_f64 
v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: 
v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: 
v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// 
W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: 
v_dual_sub_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, 
vcc_lo, v3 ; encoding: [0x7a,0x50,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction 
requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], exec, v[4:5] :: 
v_dual_subrev_f32 v7, v15, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 +// GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 +// GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 +// GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 +// GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 +// GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, 
v14 ; encoding: [0x06,0x41,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 +// GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 +// GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x95 +// GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x95 ; encoding: [0x06,0x21,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x95,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +//===----------------------------------------------------------------------===// +// Neg modifiers support. 
+//===----------------------------------------------------------------------===// + +v_dual_fma_f32 v0, -v1, v2, v3 :: v_dual_fma_f32 v5, v6, v7, v8 +// GFX1250: v_dual_fma_f32 v0, -v1, v2, v3 :: v_dual_fma_f32 v5, v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x03,0x02,0x03,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v0, v1, -v2, v3 :: v_dual_fma_f32 v5, v6, v7, v8 +// GFX1250: v_dual_fma_f32 v0, v1, -v2, v3 :: v_dual_fma_f32 v5, v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x05,0x02,0x03,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v0, v1, v2, -v3 :: v_dual_fma_f32 v5, v6, v7, v8 +// GFX1250: v_dual_fma_f32 v0, v1, v2, -v3 :: v_dual_fma_f32 v5, v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x09,0x02,0x03,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, -v6, v7, v8 +// GFX1250: v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, -v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x11,0x02,0x03,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, v6, -v7, v8 +// GFX1250: v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, v6, -v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x21,0x02,0x03,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, v6, v7, -v8 +// GFX1250: v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, v6, v7, -v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x41,0x02,0x03,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f32 v0, -s1, v2, v3 :: v_dual_bitop2_b32 v5, v6, v7 +// GFX1250: v_dual_fma_f32 v0, -s1, v2, v3 :: v_dual_bitop2_b32 v5, v6, v7 ; encoding: 
[0x01,0x20,0x4d,0xcf,0x06,0x03,0x02,0x03,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v0, v1, v2 :: v_dual_add_f32 v5, -s6, v7 +// GFX1250: v_dual_add_f32 v0, v1, v2 :: v_dual_add_f32 v5, -s6, v7 ; encoding: [0x01,0x41,0x10,0xcf,0x06,0x10,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v0, -v1, v2 :: v_dual_add_nc_u32 v5, v6, v7 +// GFX1250: v_dual_add_f32 v0, -v1, v2 :: v_dual_add_nc_u32 v5, v6, v7 ; encoding: [0x01,0x01,0x11,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v0, v1, v2, vcc_lo :: v_dual_fmac_f32 v5, -v6, -v7 +// GFX1250: v_dual_cndmask_b32 v0, v1, v2, vcc_lo :: v_dual_fmac_f32 v5, -v6, -v7 ; encoding: [0x01,0x01,0x24,0xcf,0x06,0x31,0x02,0x6a,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v0, -v1, -v2 :: v_dual_ashrrev_i32 v5, v6, v7 +// GFX1250: v_dual_fmac_f32 v0, -v1, -v2 :: v_dual_ashrrev_i32 v5, v6, v7 ; encoding: [0x01,0x61,0x01,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v0, v1, -v2 :: v_dual_fmac_f32 v5, -v6, v7 +// GFX1250: v_dual_fmac_f32 v0, v1, -v2 :: v_dual_fmac_f32 v5, -v6, v7 ; encoding: [0x01,0x01,0x00,0xcf,0x06,0x15,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v0, v1, -v2 :: v_dual_cndmask_b32 v5, v6, v7, vcc_lo +// GFX1250: v_dual_max_num_f32 v0, v1, -v2 :: v_dual_cndmask_b32 v5, v6, v7, vcc_lo ; encoding: [0x01,0x91,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x6a,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v0, -v1, v2 :: v_dual_min_num_f32 v5, v6, v7 +// GFX1250: v_dual_max_num_f32 v0, -v1, v2 :: v_dual_min_num_f32 v5, v6, v7 ; encoding: 
[0x01,0xb1,0x28,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v0, v1 :: v_dual_max_num_f32 v5, -s6, -v7 +// GFX1250: v_dual_mov_b32 v0, v1 :: v_dual_max_num_f32 v5, -s6, -v7 ; encoding: [0x01,0xa1,0x20,0xcf,0x06,0x30,0x00,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v0, -v1, v2 :: v_dual_lshlrev_b32 v5, v6, v7 +// GFX1250: v_dual_mul_dx9_zero_f32 v0, -v1, v2 :: v_dual_lshlrev_b32 v5, v6, v7 ; encoding: [0x01,0x11,0x1d,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v0, v1, -v2 :: v_dual_mul_f32 v5, v6, -v7 +// GFX1250: v_dual_mul_dx9_zero_f32 v0, v1, -v2 :: v_dual_mul_f32 v5, v6, -v7 ; encoding: [0x01,0x31,0x1c,0xcf,0x06,0x25,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v0, -v1, -v2 :: v_dual_lshrrev_b32 v5, v6, v7 +// GFX1250: v_dual_mul_f32 v0, -v1, -v2 :: v_dual_lshrrev_b32 v5, v6, v7 ; encoding: [0x01,0x51,0x0d,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v0, v1, v2 :: v_dual_mul_dx9_zero_f32 v5, -v6, -v7 +// GFX1250: v_dual_mul_f32 v0, v1, v2 :: v_dual_mul_dx9_zero_f32 v5, -v6, -v7 ; encoding: [0x01,0x71,0x0c,0xcf,0x06,0x31,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v0, v1, -v2 :: v_dual_max_i32 v5, v6, v7 +// GFX1250: v_dual_sub_f32 v0, v1, -v2 :: v_dual_max_i32 v5, v6, v7 ; encoding: [0x01,0x71,0x15,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v0, -v1, v2 :: v_dual_mul_f32 v5, -v6, -v7 +// GFX1250: v_dual_sub_f32 v0, -v1, v2 :: v_dual_mul_f32 v5, -v6, -v7 ; encoding: 
[0x01,0x31,0x14,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v0, v1, -v2 :: v_dual_min_i32 v5, v6, v7 +// GFX1250: v_dual_subrev_f32 v0, v1, -v2 :: v_dual_min_i32 v5, v6, v7 ; encoding: [0x01,0x81,0x19,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v0, v1, -v2 :: v_dual_sub_f32 v5, -v6, v7 +// GFX1250: v_dual_subrev_f32 v0, v1, -v2 :: v_dual_sub_f32 v5, -v6, v7 ; encoding: [0x01,0x51,0x18,0xcf,0x06,0x15,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[0:1], -v[8:9], v[4:5] :: v_dual_mov_b32 v5, v6 +// GFX1250: v_dual_add_f64 v[0:1], -v[8:9], v[4:5] :: v_dual_mov_b32 v5, v6 ; encoding: [0x08,0x81,0x84,0xcf,0x06,0x03,0x04,0x00,0x00,0x00,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[0:1], -s[8:9], -v[4:5] :: v_dual_subrev_f32 v5, v6, -v7 +// GFX1250: v_dual_add_f64 v[0:1], -s[8:9], -v[4:5] :: v_dual_subrev_f32 v5, v6, -v7 ; encoding: [0x08,0x60,0x84,0xcf,0x06,0x27,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v0, v1, v2 :: v_dual_fma_f32 v5, -v6, v7, -v8 +// GFX1250: v_dual_add_nc_u32 v0, v1, v2 :: v_dual_fma_f32 v5, -v6, v7, -v8 ; encoding: [0x01,0x31,0x41,0xcf,0x06,0x51,0x02,0x00,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v0, v1, v2, vcc_lo :: v_dual_add_f32 v5, -s6, -v7 +// GFX1250: v_dual_cndmask_b32 v0, v1, v2, vcc_lo :: v_dual_add_f32 v5, -s6, -v7 ; encoding: [0x01,0x41,0x24,0xcf,0x06,0x30,0x02,0x6a,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[0:1], -v[8:9], -v[4:5], -v[10:11] :: v_dual_add_nc_u32 v5, v6, v7 +// GFX1250: v_dual_fma_f64 v[0:1], -v[8:9], 
-v[4:5], -v[10:11] :: v_dual_add_nc_u32 v5, v6, v7 ; encoding: [0x08,0x01,0x81,0xcf,0x06,0x0f,0x04,0x0a,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[0:1], v[8:9], v[4:5], -v[10:11] :: v_dual_fma_f32 v5, v6, v7, -v8 +// GFX1250: v_dual_fma_f64 v[0:1], v[8:9], v[4:5], -v[10:11] :: v_dual_fma_f32 v5, v6, v7, -v8 ; encoding: [0x08,0x31,0x81,0xcf,0x06,0x49,0x04,0x0a,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v0, v1, v2 :: v_dual_min_num_f32 v5, -s6, -v7 +// GFX1250: v_dual_lshlrev_b32 v0, v1, v2 :: v_dual_min_num_f32 v5, -s6, -v7 ; encoding: [0x01,0xb1,0x44,0xcf,0x06,0x30,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v0, v1, -v2 :: v_dual_mov_b32 v5, v6 +// GFX1250: v_dual_max_num_f32 v0, v1, -v2 :: v_dual_mov_b32 v5, v6 ; encoding: [0x01,0x81,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x00,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v0, -v1, v2 :: v_dual_mul_dx9_zero_f32 v5, -v6, -v7 +// GFX1250: v_dual_max_num_f32 v0, -v1, v2 :: v_dual_mul_dx9_zero_f32 v5, -v6, -v7 ; encoding: [0x01,0x71,0x28,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[0:1], -v[8:9], -v[4:5] :: v_dual_mul_f32 v5, -v6, v7 +// GFX1250: v_dual_max_num_f64 v[0:1], -v[8:9], -v[4:5] :: v_dual_mul_f32 v5, -v6, v7 ; encoding: [0x08,0x31,0x8c,0xcf,0x06,0x17,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[0:1], v[8:9], -v[4:5] :: v_dual_sub_nc_u32 v5, v6, v7 +// GFX1250: v_dual_max_num_f64 v[0:1], v[8:9], -v[4:5] :: v_dual_sub_nc_u32 v5, v6, v7 ; encoding: [0x08,0x41,0x8d,0xcf,0x06,0x05,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_min_num_f32 v0, v1, -v2 :: v_dual_add_nc_u32 v5, v6, v7 +// GFX1250: v_dual_min_num_f32 v0, v1, -v2 :: v_dual_add_nc_u32 v5, v6, v7 ; encoding: [0x01,0x01,0x2d,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v0, -v1, v2 :: v_dual_sub_f32 v5, -v6, -v7 +// GFX1250: v_dual_min_num_f32 v0, -v1, v2 :: v_dual_sub_f32 v5, -v6, -v7 ; encoding: [0x01,0x51,0x2c,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[0:1], -s[8:9], v[4:5] :: v_dual_ashrrev_i32 v5, v6, v7 +// GFX1250: v_dual_min_num_f64 v[0:1], -s[8:9], v[4:5] :: v_dual_ashrrev_i32 v5, v6, v7 ; encoding: [0x08,0x60,0x91,0xcf,0x06,0x03,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[0:1], -v[8:9], -v[4:5] :: v_dual_subrev_f32 v5, v6, v7 +// GFX1250: v_dual_min_num_f64 v[0:1], -v[8:9], -v[4:5] :: v_dual_subrev_f32 v5, v6, v7 ; encoding: [0x08,0x61,0x90,0xcf,0x06,0x07,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v0, v1, -v2 :: v_dual_bitop2_b32 v5, v6, v7 bitop3:1 +// GFX1250: v_dual_mul_dx9_zero_f32 v0, v1, -v2 :: v_dual_bitop2_b32 v5, v6, v7 bitop3:1 ; encoding: [0x01,0x21,0x1d,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x01,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v0, -v1, v2 :: v_dual_fma_f32 v5, -s6, -v7, -v8 +// GFX1250: v_dual_mul_dx9_zero_f32 v0, -v1, v2 :: v_dual_fma_f32 v5, -s6, -v7, -v8 ; encoding: [0x01,0x31,0x1d,0xcf,0x06,0x72,0x02,0x00,0x00,0x07,0x08,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v0, -v1, -v2 :: v_dual_bitop2_b32 v5, v6, v7 bitop3:100 +// GFX1250: v_dual_mul_f32 v0, -v1, -v2 :: v_dual_bitop2_b32 v5, v6, v7 bitop3:0x64 ; encoding: 
[0x01,0x21,0x0d,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x64,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v0, v1, v2 :: v_dual_fmac_f32 v5, -v6, -v7 +// GFX1250: v_dual_mul_f32 v0, v1, v2 :: v_dual_fmac_f32 v5, -v6, -v7 ; encoding: [0x01,0x01,0x0c,0xcf,0x06,0x31,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[0:1], -v[8:9], v[4:5] :: v_dual_add_f32 v5, -v6, v7 +// GFX1250: v_dual_mul_f64 v[0:1], -v[8:9], v[4:5] :: v_dual_add_f32 v5, -v6, v7 ; encoding: [0x08,0x41,0x88,0xcf,0x06,0x13,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[0:1], v[8:9], -v[4:5] :: v_dual_lshlrev_b32 v5, v6, v7 +// GFX1250: v_dual_mul_f64 v[0:1], v[8:9], -v[4:5] :: v_dual_lshlrev_b32 v5, v6, v7 ; encoding: [0x08,0x11,0x89,0xcf,0x06,0x05,0x04,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v0, -v1, -v2 :: v_dual_lshrrev_b32 v5, v6, v7 +// GFX1250: v_dual_sub_f32 v0, -v1, -v2 :: v_dual_lshrrev_b32 v5, v6, v7 ; encoding: [0x01,0x51,0x15,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_f32 v0, v1, v2 :: v_dual_min_num_f32 v5, v6, -v7 +// GFX1250: v_dual_sub_f32 v0, v1, v2 :: v_dual_min_num_f32 v5, v6, -v7 ; encoding: [0x01,0xb1,0x14,0xcf,0x06,0x21,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v0, v1, v2 :: v_dual_mul_dx9_zero_f32 v5, v6, v7 +// GFX1250: v_dual_sub_nc_u32 v0, v1, v2 :: v_dual_mul_dx9_zero_f32 v5, v6, v7 ; encoding: [0x01,0x71,0x50,0xcf,0x06,0x01,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v0, v1, -v2 :: v_dual_max_i32 v5, v6, v7 +// GFX1250: v_dual_subrev_f32 v0, v1, -v2 :: v_dual_max_i32 v5, v6, v7 ; 
encoding: [0x01,0x71,0x19,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v0, -s1, -v2 :: v_dual_mul_f32 v5, -s6, -v7 +// GFX1250: v_dual_subrev_f32 v0, -s1, -v2 :: v_dual_mul_f32 v5, -s6, -v7 ; encoding: [0x01,0x30,0x18,0xcf,0x06,0x36,0x02,0x00,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_add_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_add_nc_u32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s97 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s97 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x24,0xcf,0x01,0x01,0x02,0x61,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_fmac_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_lshlrev_b32 v7, v1, v3 +// GFX1250: 
v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_max_num_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_min_num_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v255, s96 :: v_dual_mov_b32 v7, v1 +// GFX1250: v_dual_cndmask_b32 v255, v4, v255, s96 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x24,0xcf,0x01,0x01,0xff,0x60,0xff,0x00,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_mul_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_sub_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + 
+v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_subrev_f32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_fma_f32 v7, v1, v3, v4 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x04,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:1 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:1 ; encoding: [0x04,0x21,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x01,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v7, v1, v255, s96 +// GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v7, v1, v255, s96 ; encoding: 
[0x04,0x91,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_max_i32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_min_i32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_sub_nc_u32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_lshrrev_b32 v7, v1, v3 +// GFX1250: 
v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_ashrrev_i32 v7, v1, v3 +// GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 + +v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_cndmask_b32 v7, v1, v3, s96 +// GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v2, v3, s96 +// GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 +// GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 +// GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 +// GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 +// GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v0, v1, v2, s96 :: 
v_dual_fmac_f32 v5, -v6, -v7 +// GFX1250: v_dual_cndmask_b32 v0, v1, v2, s96 :: v_dual_fmac_f32 v5, -v6, -v7 ; encoding: [0x01,0x01,0x24,0xcf,0x06,0x31,0x02,0x60,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_max_num_f32 v0, v1, -v2 :: v_dual_cndmask_b32 v5, v6, v7, s96 +// GFX1250: v_dual_max_num_f32 v0, v1, -v2 :: v_dual_cndmask_b32 v5, v6, v7, s96 ; encoding: [0x01,0x91,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x60,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 + +v_dual_cndmask_b32 v0, v1, v2, s96 :: v_dual_add_f32 v5, -s6, -v7 +// GFX1250: v_dual_cndmask_b32 v0, v1, v2, s96 :: v_dual_add_f32 v5, -s6, -v7 ; encoding: [0x01,0x41,0x24,0xcf,0x06,0x30,0x02,0x60,0x00,0x07,0x00,0x05] +// W64-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s new file mode 100644 index 0000000000000..81b79cb8c28da --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_errs.s @@ -0,0 +1,326 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 %s 2>&1 | FileCheck %s -check-prefix=GFX12 --implicit-check-not=error: --strict-whitespace + +//===----------------------------------------------------------------------===// +// A VOPD instruction can use only one literal. +//===----------------------------------------------------------------------===// + +v_dual_mul_f32 v11, 0x24681357, v2 :: v_dual_mul_f32 v10, 0xbabe, v5 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_mul_f32 v11, 0x24681357, v2 :: v_dual_mul_f32 v10, 0xbabe, v5 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// When 2 different literals are specified, show the location +// of the last literal which is not a KImm, if any. 
+//===----------------------------------------------------------------------===// + +v_dual_fmamk_f32 v122, v74, 0xa0172923, v161 :: v_dual_lshlrev_b32 v247, 0xbabe, v99 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, v74, 0xa0172923, v161 :: v_dual_lshlrev_b32 v247, 0xbabe, v99 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xbabe +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0xbabe, v1, 0xbabe +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, 0xbabe, v1, 0xbabe +// GFX12-NEXT:{{^}} ^ + +v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 +// GFX12-NEXT:{{^}} ^ + +v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// Check that assembler detects a different literal regardless of its location. 
+//===----------------------------------------------------------------------===// + +v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0x1234, v162 +// GFX12-NEXT:{{^}} ^ + +v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0x1234, 0xdeadbeef, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0x1234, 0xdeadbeef, v162 +// GFX12-NEXT:{{^}} ^ + +v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0xdeadbeef, 0x1234, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 +// GFX12-NEXT:{{^}} ^ + +v_dual_fmamk_f32 v122, 0x1234, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, 0x1234, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// When 2 different literals are specified and all literals are KImm, +// show the location of the last KImm literal. 
+//===----------------------------------------------------------------------===// + +v_dual_fmamk_f32 v122, s0, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, s0, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, s0, 0x1234, v162 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// A VOPD instruction cannot use more than 2 scalar operands +//===----------------------------------------------------------------------===// + +// 2 different SGPRs + LITERAL + +v_dual_fmaak_f32 v122, s74, v161, 2.741 :: v_dual_max_i32 v247, s75, v98 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_fmaak_f32 v122, s74, v161, 2.741 :: v_dual_max_i32 v247, s75, v98 +// GFX12-NEXT:{{^}} ^ + +v_dual_mov_b32 v247, s73 :: v_dual_fmaak_f32 v122, s74, v161, 2.741 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_mov_b32 v247, s73 :: v_dual_fmaak_f32 v122, s74, v161, 2.741 +// GFX12-NEXT:{{^}} ^ + +v_dual_fmamk_f32 v122, s0, 0xbabe, v161 :: v_dual_fmamk_f32 v123, s1, 0xbabe, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_fmamk_f32 v122, s0, 0xbabe, v161 :: v_dual_fmamk_f32 v123, s1, 0xbabe, v162 +// GFX12-NEXT:{{^}} ^ + +// 2 different SGPRs + VCC + +v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s2, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s2, v3 +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v6, s1, v3 :: v_dual_add_f32 v255, s2, v2 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// 
GFX12-NEXT:{{^}}v_dual_cndmask_b32 v6, s1, v3 :: v_dual_add_f32 v255, s2, v2 +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s2, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s2, v3 +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v1, s2, v3, vcc_lo :: v_dual_cndmask_b32 v2, s3, v4, vcc_lo +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v1, s2, v3, vcc_lo :: v_dual_cndmask_b32 v2, s3, v4, vcc_lo +// GFX12-NEXT:{{^}} ^ + +// SGPR + LITERAL + VCC + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mov_b32 v254, 0xbabe +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mov_b32 v254, 0xbabe +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v255, 0xbabe, v2 :: v_dual_mov_b32 v254, s1 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v255, 0xbabe, v2 :: v_dual_mov_b32 v254, s1 +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v255, s3, v2 :: v_dual_fmamk_f32 v254, v1, 0xbabe, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v255, s3, v2 :: v_dual_fmamk_f32 v254, v1, 0xbabe, v162 +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmamk_f32 v254, s3, 0xbabe, v162 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmamk_f32 v254, s3, 0xbabe, v162 +// GFX12-NEXT:{{^}} ^ + +// SGPR + VCC + VCC_LO +// This is a special case because implicit VCC operand has 64 bit size. +// SP3 does not accept this instruction as well. 
+ +v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions) +// GFX12-NEXT:{{^}}v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12-NEXT:{{^}} ^ + +// FIXME: Error should be 'unsupported instruction' +v_dual_add_f32 v255, v4, v2 :: v_dual_and_b32 v6, v1, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX12-NEXT:{{^}}v_dual_add_f32 v255, v4, v2 :: v_dual_and_b32 v6, v1, v3 +// GFX12-NEXT:{{^}}^ + +v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmaak_f32 v7, v101, v3, 0xaf123456 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: one dst register must be even and the other odd +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmaak_f32 v7, v101, v3, 0xaf123456 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f32 v2, v2, v5 :: v_dual_mul_f32 v4, 130, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: one dst register must be even and the other odd +// GFX12-NEXT:{{^}}v_dual_add_f32 v2, v2, v5 :: v_dual_mul_f32 v4, 130, v6 +// GFX12-NEXT:{{^}} ^ + +// Even though it could be represented as VOPD3, fmac reads its dst and bank constraints still apply to src2. 
+v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: one dst register must be even and the other odd +// GFX12-NEXT:{{^}}v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 +// GFX12-NEXT:{{^}} ^ + +// Destination should be distinct even if not checked for parity in VOPD3 +v_dual_fmac_f32 v7, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: dst registers must be distinct +// GFX12-NEXT:{{^}}v_dual_fmac_f32 v7, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f32 v7, v4, v2 :: v_dual_add_f32 v7, v5, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: dst registers must be distinct +// GFX12-NEXT:{{^}}v_dual_add_f32 v7, v4, v2 :: v_dual_add_f32 v7, v5, v3 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// A 64-bit operand shall not have bank conflicts with both subregs. +// There is also NO exception that a 64 bit operand can start with the same +// register as 32 bit. 
+//===----------------------------------------------------------------------===// +v_dual_add_f64 v[2:3], v[4:5], v[8:9] :: v_dual_ashrrev_i32 v5, v8, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src0 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_add_f64 v[2:3], v[4:5], v[8:9] :: v_dual_ashrrev_i32 v5, v8, v6 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f64 v[2:3], v[4:5], v[8:9] :: v_dual_ashrrev_i32 v5, v9, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src0 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_add_f64 v[2:3], v[4:5], v[8:9] :: v_dual_ashrrev_i32 v5, v9, v6 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f64 v[2:3], v[4:5], v[8:9] :: v_dual_ashrrev_i32 v5, v4, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src0 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_add_f64 v[2:3], v[4:5], v[8:9] :: v_dual_ashrrev_i32 v5, v4, v6 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_f64 v[2:3], 1, v[8:9] :: v_dual_ashrrev_i32 v3, v7, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: dst registers must be distinct +// GFX12-NEXT:{{^}}v_dual_add_f64 v[2:3], 1, v[8:9] :: v_dual_ashrrev_i32 v3, v7, v6 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// Literals not supported by VOPD3. Inline literals can only be encoded for +// src0, but not for vsrc1 or vsrc2. 
+//===----------------------------------------------------------------------===// +v_dual_add_f64 v[2:3], 100.0, v[8:9] :: v_dual_ashrrev_i32 v4, v7, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX12-NEXT:{{^}}v_dual_add_f64 v[2:3], 100.0, v[8:9] :: v_dual_ashrrev_i32 v4, v7, v6 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, s105, v2, v255 :: v_dual_fma_f32 v7, 1, 0, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, s105, v2, v255 :: v_dual_fma_f32 v7, 1, 0, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, s105, v2, v255 :: v_dual_fma_f32 v7, 1, v0, 0 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, s105, v2, v255 :: v_dual_fma_f32 v7, 1, v0, 0 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// Check that we properly detect bank conflicts if instruction is derived from +// VOP3. 
+//===----------------------------------------------------------------------===// +v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fma_f32 v3, v8, v7, v6 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src0 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fma_f32 v3, v8, v7, v6 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fma_f32 v3, v5, v6, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src1 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fma_f32 v3, v5, v6, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fma_f32 v3, v5, v8, v7 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src2 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fma_f32 v3, v5, v8, v7 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fmac_f32 v7, v5, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src2 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_fma_f32 v1, v4, v2, v3 :: v_dual_fmac_f32 v7, v5, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_fmac_f32 v7, v5, v8 :: v_dual_fma_f32 v1, v4, v2, v3 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: src2 operands must use different VGPR banks +// GFX12-NEXT:{{^}}v_dual_fmac_f32 v7, v5, v8 :: v_dual_fma_f32 v1, v4, v2, v3 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// ABS modifiers are not supported +//===----------------------------------------------------------------------===// +v_dual_fma_f32 v255, |s105|, v0, v1 :: v_dual_add_nc_u32 v7, s1, v0 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, |s105|, v0, v1 :: v_dual_add_nc_u32 v7, s1, v0 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, s105, abs(v0), v1 :: v_dual_fma_f32 v7, s1, v0, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// 
GFX12-NEXT:{{^}}v_dual_fma_f32 v255, s105, abs(v0), v1 :: v_dual_fma_f32 v7, s1, v0, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, s105, v0, |v1| :: v_dual_fma_f32 v7, s1, v0, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, s105, v0, |v1| :: v_dual_fma_f32 v7, s1, v0, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_nc_u32 v255, s105, v0 :: v_dual_fma_f32 v7, |1|, v0, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// GFX12-NEXT:{{^}}v_dual_add_nc_u32 v255, s105, v0 :: v_dual_fma_f32 v7, |1|, v0, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, s105, v0, v1 :: v_dual_fma_f32 v7, s1, -|v0|, v8 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, s105, v0, v1 :: v_dual_fma_f32 v7, s1, -|v0|, v8 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, s105, v0, v1 :: v_dual_fma_f32 v7, s1, v0, -abs(v8) +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, s105, v0, v1 :: v_dual_fma_f32 v7, s1, v0, -abs(v8) +// GFX12-NEXT:{{^}} ^ + +v_dual_mul_f64 v[6:7], -|v[2:3]|, v[4:5] :: v_dual_fma_f32 v255, -s105, v2, v1 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: ABS not allowed in VOPD3 instructions +// GFX12-NEXT:{{^}}v_dual_mul_f64 v[6:7], -|v[2:3]|, v[4:5] :: v_dual_fma_f32 v255, -s105, v2, v1 +// GFX12-NEXT:{{^}} ^ + +//===----------------------------------------------------------------------===// +// No modifiers on non-fp part of an instruction +//===----------------------------------------------------------------------===// +v_dual_fma_f32 v255, -s105, v0, v1 :: v_dual_lshrrev_b32 v7, -s1, v0 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, -s105, v0, v1 :: v_dual_lshrrev_b32 v7, -s1, v0 +// GFX12-NEXT:{{^}} ^ + +v_dual_fma_f32 v255, -s105, v0, v1 :: v_dual_max_i32 v7, s1, 
-v0 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX12-NEXT:{{^}}v_dual_fma_f32 v255, -s105, v0, v1 :: v_dual_max_i32 v7, s1, -v0 +// GFX12-NEXT:{{^}} ^ + +v_dual_add_nc_u32 v7, -s1, v0 :: v_dual_fma_f32 v255, -s105, v0, v1 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand +// GFX12-NEXT:{{^}}v_dual_add_nc_u32 v7, -s1, v0 :: v_dual_fma_f32 v255, -s105, v0, v1 +// GFX12-NEXT:{{^}} ^ + +v_dual_sub_nc_u32 v7, s1, -v0 :: v_dual_fma_f32 v255, -s105, v0, v1 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand +// GFX12-NEXT:{{^}}v_dual_sub_nc_u32 v7, s1, -v0 :: v_dual_fma_f32 v255, -s105, v0, v1 +// GFX12-NEXT:{{^}} ^ + +v_dual_cndmask_b32 v28, sext(v15), v15, s46 :: v_dual_cndmask_b32 v29, v13, -v13, s46 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v28, sext(v15), v15, s46 :: v_dual_cndmask_b32 v29, v13, -v13, s46 +// GFX12-NEXT:{{^}} ^ + + +v_dual_cndmask_b32 v28, -v15, v15, s46 :: v_dual_cndmask_b32 v29, sext(v13), -v13, s46 +// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand +// GFX12-NEXT:{{^}}v_dual_cndmask_b32 v28, -v15, v15, s46 :: v_dual_cndmask_b32 v29, sext(v13), -v13, s46 +// GFX12-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_features.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_features.s new file mode 100644 index 0000000000000..cdd9f301e2506 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vopd_features.s @@ -0,0 +1,109 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck -check-prefix=GFX12 %s + +//===----------------------------------------------------------------------===// +// A VOPD instruction can use one or more literals, +// provided that they are identical. 
+//===----------------------------------------------------------------------===// + +// LITERAL + +v_dual_mul_f32 v11, v1, v2 :: v_dual_mul_f32 v10, 0x24681357, v5 +// GFX12: encoding: [0x01,0x05,0xc6,0xc8,0xff,0x0a,0x0a,0x0b,0x57,0x13,0x68,0x24] + +// LITERAL*2 + +v_dual_mul_f32 v11, 0x24681357, v2 :: v_dual_mul_f32 v10, 0x24681357, v5 +// GFX12: encoding: [0xff,0x04,0xc6,0xc8,0xff,0x0a,0x0a,0x0b,0x57,0x13,0x68,0x24] + +// LITERAL + KIMM + +v_dual_add_f32 v5, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, v3, v1, 0xaf123456 ; +// GFX12: encoding: [0xff,0x04,0x02,0xc9,0x03,0x03,0x06,0x05,0x56,0x34,0x12,0xaf] + +// KIMM + LITERAL + +v_dual_fmamk_f32 v122, v74, 0xa0172923, v161 :: v_dual_lshlrev_b32 v247, 0xa0172923, v99 +// GFX12: encoding: [0x4a,0x43,0xa3,0xc8,0xff,0xc6,0xf6,0x7a,0x23,0x29,0x17,0xa0] + +// KIMM*2 + +v_dual_fmamk_f32 v122, 0xdeadbeef, 0xdeadbeef, v161 :: v_dual_fmamk_f32 v123, 0xdeadbeef, 0xdeadbeef, v162 +// GFX12: encoding: [0xff,0x42,0x85,0xc8,0xff,0x44,0x7b,0x7a,0xef,0xbe,0xad,0xde] + +//===----------------------------------------------------------------------===// +// A VOPD instruction can use 2 scalar operands, +// but implicit VCC must be counted in. 
+//===----------------------------------------------------------------------===// + +// 2 different SGPRs + +v_dual_mul_f32 v0, s1, v2 :: v_dual_mul_f32 v3, s4, v5 +// GFX12: encoding: [0x01,0x04,0xc6,0xc8,0x04,0x0a,0x02,0x00] + +// SGPR + LITERAL + +v_dual_fmaak_f32 v122, s74, v161, 2.741 :: v_dual_max_i32 v247, v160, v98 +// GFX12: encoding: [0x4a,0x42,0x6f,0xc8,0xa0,0xc5,0xf6,0x7a,0x8b,0x6c,0x2f,0x40] + +v_dual_mov_b32 v247, v160 :: v_dual_fmaak_f32 v122, s74, v161, 2.741 +// GFX12: encoding: [0xa0,0x01,0x02,0xca,0x4a,0x42,0x7b,0xf7,0x8b,0x6c,0x2f,0x40] + +// SGPR*2 + LITERAL + +v_dual_fmaak_f32 v122, s74, v161, 2.741 :: v_dual_max_i32 v247, s74, v98 +// GFX12: encoding: [0x4a,0x42,0x6f,0xc8,0x4a,0xc4,0xf6,0x7a,0x8b,0x6c,0x2f,0x40] + +// SGPR + LITERAL*2 + +v_dual_fmaak_f32 v122, s74, v161, 2.741 :: v_dual_fmamk_f32 v3, v6, 2.741, v1 +// GFX12: encoding: [0x4a,0x42,0x45,0xc8,0x06,0x03,0x02,0x7a,0x8b,0x6c,0x2f,0x40] + +// SGPR*2 + LITERAL*2 + +v_dual_fmaak_f32 v122, s74, v161, 2.741 :: v_dual_fmamk_f32 v3, s74, 2.741, v1 +// GFX12: encoding: [0x4a,0x42,0x45,0xc8,0x4a,0x02,0x02,0x7a,0x8b,0x6c,0x2f,0x40] + +// LITERAL + VCC + +v_dual_fmaak_f32 v122, v0, v161, 2.741 :: v_dual_cndmask_b32 v1, v2, v3 +// GFX12: encoding: [0x00,0x43,0x53,0xc8,0x02,0x07,0x00,0x7a,0x8b,0x6c,0x2f,0x40] + +// LITERAL*2 + VCC + +v_dual_fmaak_f32 v122, v0, v161, 2.741 :: v_dual_cndmask_b32 v1, 2.741, v3 +// GFX12: encoding: [0x00,0x43,0x53,0xc8,0xff,0x06,0x00,0x7a,0x8b,0x6c,0x2f,0x40] + +// LITERAL*2 + VCC*2 + +v_dual_cndmask_b32 v255, 0xbabe, v2 :: v_dual_cndmask_b32 v6, 0xbabe, v3 +// GFX12: encoding: [0xff,0x04,0x52,0xca,0xff,0x06,0x06,0xff,0xbe,0xba,0x00,0x00] + +// SGPR*2 + VCC + +v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 +// GFX12: encoding: [0x69,0x04,0x12,0xc9,0x69,0x06,0x06,0xff] + +// SGPR*2 + VCC*2 + +v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 +// GFX12: encoding: [0x01,0x04,0x52,0xca,0x01,0x06,0x06,0xff] + +// VCC*2 + +v_dual_add_f32 
v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, v1, v3 +// GFX12: encoding: [0x6a,0x04,0x12,0xc9,0x01,0x07,0x06,0xff] + +//===----------------------------------------------------------------------===// +// A VOPD OpY mov_b32 instruction uses SRC2 source-cache if OpX is also mov_b32 +//===----------------------------------------------------------------------===// + +v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v1 +// GFX12: encoding: [0x05,0x01,0x10,0xca,0x01,0x01,0x02,0x02] + +//===----------------------------------------------------------------------===// +// SRCX0 and SRCY0 may use the same bank if they are using the same VGPR; same for +// VSRCX1 and VSRCY1. +//===----------------------------------------------------------------------===// + +v_dual_add_f32 v2, v2, v5 :: v_dual_mul_f32 v3, v2, v5 +// GFX12: encoding: [0x02,0x0b,0x06,0xc9,0x02,0x0b,0x02,0x02] diff --git a/llvm/test/MC/AMDGPU/gfx1250_err.s b/llvm/test/MC/AMDGPU/gfx1250_err.s index ddb6d9520ce1d..e04c6aa930150 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_err.s @@ -1,7 +1,42 @@ // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1250-ERR --implicit-check-not=error: -strict-whitespace %s +// For v_dual_cndmask_b32 use of the explicit src2 forces VOPD3 form even if it is vcc_lo. +// If src2 is omitted then it forces VOPD form. As a result a proper form of the instruction +// has to be used if the other component of the dual instruction cannot be used in that +// encoding. 
+ +v_dual_cndmask_b32 v2, v4, v1 :: v_dual_fma_f32 v7, v1, v2, v3 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid VOPDY instruction +// GFX1250-ERR: v_dual_cndmask_b32 v2, v4, v1 :: v_dual_fma_f32 v7, v1, v2, v3 +// GFX1250-ERR: ^ + +v_dual_fma_f32 v7, v1, v2, v3 :: v_dual_cndmask_b32 v2, v4, v1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: too few operands for instruction +// GFX1250-ERR: v_dual_fma_f32 v7, v1, v2, v3 :: v_dual_cndmask_b32 v2, v4, v1 +// GFX1250-ERR: ^ + +v_dual_cndmask_b32 v7, v1, v2 :: v_dual_cndmask_b32 v2, v4, v1, vcc_lo +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR: v_dual_cndmask_b32 v7, v1, v2 :: v_dual_cndmask_b32 v2, v4, v1, vcc_lo +// GFX1250-ERR: ^ + +v_dual_cndmask_b32 v7, v1, v2, vcc_lo :: v_dual_cndmask_b32 v2, v4, v1 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: too few operands for instruction +// GFX1250-ERR: v_dual_cndmask_b32 v7, v1, v2, vcc_lo :: v_dual_cndmask_b32 v2, v4, v1 +// GFX1250-ERR: ^ + // Check for unique 64-bit literal +v_mov_b64 v[4:5], v[2:3] quad_perm:[1,1,1,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR: v_mov_b64 v[4:5], v[2:3] quad_perm:[1,1,1,1] +// GFX1250-ERR: ^ + +v_mov_b64 v[4:5], v[2:3] dpp8:[7,6,5,4,3,2,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR: v_mov_b64 v[4:5], v[2:3] dpp8:[7,6,5,4,3,2,1,0] +// GFX1250-ERR: ^ + s_andn2_b64 s[2:3], 0x10abcdef12345678, 0xabcdef12345678 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed // GFX1250-ERR: s_andn2_b64 s[2:3], 0x10abcdef12345678, 0xabcdef12345678 @@ -61,3 +96,43 @@ v_ceil_f64 v[2:3], lit64(123 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected closing parentheses // GFX1250-ERR: v_ceil_f64 v[2:3], lit64(123 // GFX1250-ERR: ^ + +v_fmaak_f64 v[4:5], lit(lit64(0x7e8)), v[8:9], lit64(0x7e8) +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR: v_fmaak_f64 v[4:5], lit(lit64(0x7e8)), v[8:9], lit64(0x7e8) +// GFX1250-ERR: ^ + +v_fmaak_f64 v[4:5], lit64(lit64(0x7e8)), v[8:9], lit64(0x7e8) +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR: v_fmaak_f64 v[4:5], lit64(lit64(0x7e8)), v[8:9], lit64(0x7e8) +// GFX1250-ERR: ^ + +v_fmaak_f64 v[4:5], lit64(lit(0x7e8)), v[8:9], lit64(0x7e8) +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR: v_fmaak_f64 v[4:5], lit64(lit(0x7e8)), v[8:9], lit64(0x7e8) +// GFX1250-ERR: ^ + +v_fmamk_f64 v[4:5], 123.0, 123.1, v[6:7] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX1250-ERR: v_fmamk_f64 v[4:5], 123.0, 123.1, v[6:7] +// GFX1250-ERR: ^ + +v_fmamk_f64 v[4:5], 0x405ec00000000001, 123.0, v[6:7] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX1250-ERR: v_fmamk_f64 v[4:5], 0x405ec00000000001, 123.0, v[6:7] +// GFX1250-ERR: ^ + +v_fmaak_f64 v[4:5], 123.1, v[6:7], 123.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX1250-ERR: v_fmaak_f64 v[4:5], 123.1, v[6:7], 123.0 +// GFX1250-ERR: ^ + +v_fmaak_f64 v[4:5], 123.0, v[6:7], 0x405ec00000000001 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX1250-ERR: v_fmaak_f64 v[4:5], 123.0, v[6:7], 0x405ec00000000001 +// GFX1250-ERR: ^ + +v_fmaak_f64 v[4:5], 0x7e8, v[8:9], lit64(0x7e8) +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed +// GFX1250-ERR: v_fmaak_f64 v[4:5], 0x7e8, v[8:9], lit64(0x7e8) +// GFX1250-ERR: ^ diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s index 4b5efd00a7adf..85978b04779d0 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s @@ -88,4 +88,7 @@ v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD // CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x05] v_pk_fmac_f16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x06] + +v_pk_fmac_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD // CHECK-MI: [0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x16] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt new file mode 100644 index 0000000000000..89731fcc936e6 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe] +0xc3,0x4e,0x80,0xbe + +# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe] +0xc3,0x50,0x83,0xbe + +# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe] +0xc4,0x50,0x83,0xbe + +# GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] +0x7d,0x50,0x83,0xbe diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt index 220f9e5084f0e..e7026df3c0e2b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt @@ -1,5 +1,20 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# GFX1250: s_wait_asynccnt 0x1234 ; encoding: [0x34,0x12,0xca,0xbf] +0x34,0x12,0xca,0xbf + +# GFX1250: s_wait_asynccnt 0xc1d1 ; encoding: [0xd1,0xc1,0xca,0xbf] +0xd1,0xc1,0xca,0xbf + +# GFX1250: s_wait_tensorcnt 0x0 ; encoding: [0x00,0x00,0xcb,0xbf] +0x00,0x00,0xcb,0xbf + +# GFX1250: s_wait_tensorcnt 0x1 ; encoding: [0x01,0x00,0xcb,0xbf] +0x01,0x00,0xcb,0xbf + +# GFX1250: s_wait_tensorcnt 0x3 ; 
encoding: [0x03,0x00,0xcb,0xbf] +0x03,0x00,0xcb,0xbf + # GFX1250: s_wait_xcnt 0x0 ; encoding: [0x00,0x00,0xc5,0xbf] 0x00,0x00,0xc5,0xbf diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt new file mode 100644 index 0000000000000..f0fcddb06599f --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt @@ -0,0 +1,110 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0x04,0xfc,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[2:3], 0x405ec000 ; encoding: [0xfe,0x04,0xfc,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0xfc,0xfd,0x49,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[254:255], lit64(0x405ec00012345678), v[254:255], lit64(0x405ec00012345678) ; encoding: [0xfe,0xfc,0xfd,0x49,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] + +0xfe,0x0c,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[4:5], lit64(0x405ec66666666666), v[6:7], lit64(0x405ec66666666666) ; encoding: [0xfe,0x0c,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] + +0xfe,0x10,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[4:5], lit64(0x405ec66666666666), v[8:9], lit64(0x405ec66666666666) ; encoding: [0xfe,0x10,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] + +0xf2,0x10,0x08,0x48,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f +# GFX1250: v_fmaak_f64 v[4:5], 1.0, v[8:9], 0x3ff00000 ; encoding: [0xf2,0x10,0x08,0x48,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f] + +0xfe,0x0c,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00 +# 
GFX1250: v_fmaak_f64 v[4:5], lit64(0x7e8), v[6:7], lit64(0x7e8) ; encoding: [0xfe,0x0c,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] + +0xfe,0x10,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_fmaak_f64 v[4:5], lit64(0x7e8), v[8:9], lit64(0x7e8) ; encoding: [0xfe,0x10,0x08,0x48,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] + +0x02,0x05,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[4:5], v[2:3], v[2:3], lit64(0x405ec66666666666) ; encoding: [0x02,0x05,0x08,0x48,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] + +0xc1,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], -1, v[8:9], 0x405ec000 ; encoding: [0xc1,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xf0,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], 0.5, v[8:9], 0x405ec000 ; encoding: [0xf0,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x7e,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], exec, v[8:9], 0x405ec000 ; encoding: [0x7e,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x7c,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], null, v[8:9], 0x405ec000 ; encoding: [0x7c,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x02,0x10,0x0c,0x48,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], s[2:3], v[8:9], lit64(0x405ec00012345678) ; encoding: [0x02,0x10,0x0c,0x48,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] + +0xfd,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], src_scc, v[8:9], 0x405ec000 ; encoding: [0xfd,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0x11,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], v[254:255], v[8:9], 0x405ec000 ; encoding: [0xfe,0x11,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x04,0x11,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# 
GFX1250: v_fmaak_f64 v[6:7], v[4:5], v[8:9], 0x405ec000 ; encoding: [0x04,0x11,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x6a,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmaak_f64 v[6:7], vcc, v[8:9], 0x405ec000 ; encoding: [0x6a,0x10,0x0c,0x48,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0xfc,0xfd,0x47,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[254:255], 0x405ec000, 0x405ec000, v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x47,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0x04,0xfc,0x47,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[254:255], 0x405ec000, 0x405ec000, v[2:3] ; encoding: [0xfe,0x04,0xfc,0x47,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0xfc,0xfd,0x47,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[254:255], lit64(0x405ec00012345678), lit64(0x405ec00012345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x47,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] + +0xfe,0x0c,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[4:5], lit64(0x405ec66666666666), lit64(0x405ec66666666666), v[6:7] ; encoding: [0xfe,0x0c,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] + +0xfe,0x10,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[4:5], lit64(0x405ec66666666666), lit64(0x405ec66666666666), v[8:9] ; encoding: [0xfe,0x10,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] + +0xf2,0x0c,0x08,0x46,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f +# GFX1250: v_fmamk_f64 v[4:5], 1.0, 0x3ff00000, v[6:7] ; encoding: [0xf2,0x0c,0x08,0x46,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f] + +0xfe,0x0c,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_fmamk_f64 v[4:5], lit64(0x7e8), lit64(0x7e8), v[6:7] ; encoding: [0xfe,0x0c,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] + +0xfe,0x10,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_fmamk_f64 v[4:5], lit64(0x7e8), lit64(0x7e8), v[8:9] ; encoding: 
[0xfe,0x10,0x08,0x46,0xe8,0x07,0x00,0x00,0x00,0x00,0x00,0x00] + +0x02,0x0d,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[4:5], v[2:3], lit64(0x405ec66666666666), v[6:7] ; encoding: [0x02,0x0d,0x08,0x46,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] + +0xc1,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], -1, 0x405ec000, v[2:3] ; encoding: [0xc1,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xf0,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], 0.5, 0x405ec000, v[2:3] ; encoding: [0xf0,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x7e,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], exec, 0x405ec000, v[2:3] ; encoding: [0x7e,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x7c,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], null, 0x405ec000, v[2:3] ; encoding: [0x7c,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x02,0x04,0x0c,0x46,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], s[2:3], lit64(0x405ec00012345678), v[2:3] ; encoding: [0x02,0x04,0x0c,0x46,0x78,0x56,0x34,0x12,0x00,0xc0,0x5e,0x40] + +0xfd,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], src_scc, 0x405ec000, v[2:3] ; encoding: [0xfd,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0xfe,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], v[254:255], 0x405ec000, v[2:3] ; encoding: [0xfe,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] + +0x6a,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 +# GFX1250: v_fmamk_f64 v[6:7], vcc, 0x405ec000, v[2:3] ; encoding: 
[0x6a,0x04,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vopd.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vopd.txt new file mode 100644 index 0000000000000..119f80ab8bd86 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vopd.txt @@ -0,0 +1,12205 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck %s --check-prefix=GFX1250 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x08,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x08,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x20,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x20,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2c,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x2c,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x12,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x12,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x02,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x02,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x00,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x00,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x04,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x04,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x22,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x22,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: 
v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2a,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x2a,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2e,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x2e,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x14,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x14,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x30,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x30,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x16,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x16,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x10,0xc9,0xfd,0x00,0x06,0xff] +0xc1,0x08,0x10,0xc9,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0e,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x0e,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x06,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x06,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0a,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x0a,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x28,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x28,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0c,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x0c,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x08,0xc9,0xf0,0x04,0x06,0xff] 
+0xf0,0x06,0x08,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x20,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x20,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2c,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x2c,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x12,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x12,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x02,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x02,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x00,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x00,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x04,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x04,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x22,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x22,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2a,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x2a,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2e,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x2e,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x14,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x14,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x30,0xc9,0xf0,0x04,0x06,0xff] 
+0xf0,0x06,0x30,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x16,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x16,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x10,0xc9,0xf0,0x00,0x06,0xff] +0xf0,0x06,0x10,0xc9,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0e,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x0e,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x06,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x06,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0a,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x0a,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x28,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x28,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0c,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x0c,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x08,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x08,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x20,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x20,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x2c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: 
[0xff,0x04,0x12,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x12,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x02,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x02,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x00,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x00,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x22,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x22,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x2a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x2a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x2e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x14,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x14,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x30,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x30,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x16,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x16,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: 
[0xff,0x04,0x0e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x0e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x06,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x06,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x0a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x28,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x28,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x0c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x05,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x05,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x11,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x11,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x08,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x08,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x20,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x20,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2c,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x2c,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: 
v_dual_add_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x12,0xc9,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x12,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x02,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x02,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x00,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x00,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x22,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x22,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2a,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x2a,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2e,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x2e,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x14,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x14,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x30,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x30,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x16,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x16,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0e,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x0e,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x06,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x06,0xc9,0x6b,0x06,0x06,0xff + 
+# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0a,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x0a,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x28,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x28,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0c,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x0c,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x05,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x05,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x11,0xc9,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0x11,0xc9,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x08,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x08,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x20,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x20,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2c,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x2c,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x12,0xc9,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x12,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x02,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x02,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: 
[0x7e,0x04,0x00,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x00,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x22,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x22,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2a,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x2a,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2e,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x2e,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x14,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x14,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x30,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x30,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x16,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x16,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0e,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x0e,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x06,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x06,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0a,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x0a,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x28,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x28,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0c,0xc9,0x7b,0x06,0x06,0xff] 
+0x7e,0x04,0x0c,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x05,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x05,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x11,0xc9,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0x11,0xc9,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x08,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x08,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x20,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x20,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2c,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x2c,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x12,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x12,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x02,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x02,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x00,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x00,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x22,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x22,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2a,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x2a,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2e,0xc9,0x7d,0x06,0x06,0xff] 
+0x7d,0x04,0x2e,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x14,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x14,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x30,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x30,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x16,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x16,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0e,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x0e,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x06,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x06,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0a,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x0a,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x28,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x28,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0c,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x0c,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x05,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x05,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x11,0xc9,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0x11,0xc9,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x08,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x08,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: 
v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x20,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x20,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x2c,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x2c,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x12,0xc9,0x01,0x06,0x06,0xff] +0x01,0x04,0x12,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x02,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x02,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x00,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x00,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x22,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x22,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x2a,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x2a,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x2e,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x2e,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x14,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x14,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x30,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x30,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x16,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x16,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: 
[0x01,0x04,0x0e,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x0e,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x06,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x06,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0a,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x0a,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x28,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x28,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0c,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x0c,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x05,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x05,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x11,0xc9,0x69,0x00,0x06,0xff] +0x01,0xfe,0x11,0xc9,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x08,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x08,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x20,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x20,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2c,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x2c,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x12,0xc9,0x69,0x06,0x06,0xff] +0x69,0x04,0x12,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x02,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x69,0x04,0x02,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x00,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x00,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x22,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x22,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x2a,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x2a,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2e,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x2e,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x14,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x14,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x30,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x30,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x16,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x16,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0e,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x0e,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x06,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x06,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0a,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x0a,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x28,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x28,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: 
v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0c,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x0c,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x05,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x05,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x11,0xc9,0x01,0x00,0x06,0xff] +0x69,0xfe,0x11,0xc9,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x08,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x08,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x20,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x20,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2c,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x2c,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x12,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x12,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x02,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x02,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x00,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x00,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x22,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x22,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2a,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x2a,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, 
src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2e,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x2e,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x14,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x14,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x30,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x30,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x16,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x16,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0e,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x0e,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x06,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x06,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0a,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x0a,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x28,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x28,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0c,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x0c,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x05,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x05,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x11,0xc9,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0x11,0xc9,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, 
vcc_lo, v3 ; encoding: [0x7b,0x04,0x08,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x08,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x20,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x20,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2c,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x2c,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x12,0xc9,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x12,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x02,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x02,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x00,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x00,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x22,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x22,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2a,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x2a,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2e,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x2e,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x14,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x14,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x30,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x30,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, 
vcc_lo, v3 ; encoding: [0x7b,0x04,0x16,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x16,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0e,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x0e,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x06,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x06,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0a,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x0a,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x28,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x28,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0c,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x0c,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x05,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x05,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x11,0xc9,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0x11,0xc9,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x08,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x08,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x20,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x20,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2c,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x2c,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: 
[0x01,0x05,0x12,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x12,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x02,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x02,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x00,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x00,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x22,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x22,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x2a,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x2a,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2e,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x2e,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x14,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x14,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x30,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x30,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x16,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x16,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0e,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x0e,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x06,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x06,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0a,0xc9,0xff,0x07,0x06,0xff] 
+0x01,0x05,0x0a,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x28,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x28,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0c,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x0c,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x05,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x05,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x11,0xc9,0xff,0x01,0x06,0xff] +0x01,0xff,0x11,0xc9,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x08,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x08,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x20,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x20,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2c,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x2c,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x12,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x12,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x02,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x02,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x00,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x00,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x22,0xc9,0x03,0x07,0x06,0xff] 
+0x02,0x05,0x22,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x2a,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x2a,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2e,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x2e,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x14,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x14,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x30,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x30,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x16,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x16,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0e,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x0e,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x06,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x06,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0a,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x0a,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x28,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x28,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0c,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x0c,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x05,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x05,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v2, v255 :: 
v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x11,0xc9,0x03,0x01,0x06,0xff] +0x02,0xff,0x11,0xc9,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x08,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x08,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x20,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x20,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2c,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x2c,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x12,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x12,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x02,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x02,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x00,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x00,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x22,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x22,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x2a,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x2a,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2e,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x2e,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x14,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x14,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: 
[0xff,0x05,0x30,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x30,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x16,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x16,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0e,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x0e,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x06,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x06,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0a,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x0a,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x28,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x28,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0c,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x0c,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x05,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x05,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x11,0xc9,0x02,0x01,0x06,0xff] +0xff,0xff,0x11,0xc9,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x08,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x08,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x20,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x20,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2c,0xc9,0x04,0x07,0x06,0xff] 
+0x03,0x05,0x2c,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x12,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x12,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x02,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x02,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x00,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x00,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x22,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x22,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x2a,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x2a,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2e,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x2e,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x14,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x14,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x30,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x30,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x16,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x16,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0e,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x0e,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x06,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x06,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: 
v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0a,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x0a,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x28,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x28,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0c,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x0c,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x05,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x05,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x11,0xc9,0x04,0x01,0x06,0xff] +0x03,0xff,0x11,0xc9,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x08,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x20,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x20,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2c,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x2c,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x12,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x12,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x02,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x02,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x00,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x00,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: 
[0x04,0x05,0x22,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x22,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x2a,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x2a,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2e,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x2e,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x14,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x14,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x30,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x30,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x16,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x16,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0e,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x0e,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x06,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x06,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0a,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x0a,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x28,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x28,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0c,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x0c,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x05,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x05,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# 
GFX1250: v_dual_add_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x11,0xc9,0x01,0x01,0x06,0xff] +0x04,0xff,0x11,0xc9,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x08,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x08,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x20,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x20,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2c,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x2c,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x12,0xc9,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x12,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x02,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x02,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x00,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x00,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x22,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x22,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2a,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x2a,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2e,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x2e,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x14,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x14,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: 
v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x30,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x30,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x16,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x16,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0e,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x0e,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x06,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x06,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0a,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x0a,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x28,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x28,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0c,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x0c,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x05,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x05,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x11,0xc9,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0x11,0xc9,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x08,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x08,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x20,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x20,0xc9,0x7f,0x06,0x06,0xff + +# 
GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2c,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x2c,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x12,0xc9,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x12,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x02,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x02,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x00,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x00,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x22,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x22,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2a,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x2a,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2e,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x2e,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x14,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x14,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x30,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x30,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x16,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x16,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0e,0xc9,0x7f,0x06,0x06,0xff] 
+0x6a,0x04,0x0e,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x06,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x06,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0a,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x0a,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x28,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x28,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0c,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x0c,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x05,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x05,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x11,0xc9,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0x11,0xc9,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x08,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x08,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x20,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x20,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x2c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x12,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] 
+0x7c,0x0a,0x12,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x02,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x02,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x00,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x00,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x04,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x04,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x22,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x22,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x2a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x2e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x14,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x14,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x30,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x30,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x16,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] 
+0x7c,0x0a,0x16,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x10,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x10,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x0e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x06,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x06,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x0a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x28,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x28,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_add_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x0c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x48,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x48,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x60,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x60,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6c,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x6c,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: 
[0xc1,0x08,0x52,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x52,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmaak_f32 v6, 0.5, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x42,0xca,0xf0,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x42,0xca,0xf0,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x40,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x40,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x44,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x44,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x62,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x62,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6a,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x6a,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6e,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x6e,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x54,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x54,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x70,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x70,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x56,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x56,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x50,0xca,0xfd,0x00,0x06,0xff] +0xc1,0x08,0x50,0xca,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: 
v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4e,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x4e,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x46,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x46,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4a,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x4a,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x68,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x68,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4c,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x4c,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x48,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x48,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x60,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x60,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6c,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x6c,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x52,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x52,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, -1, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x42,0xca,0xc1,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x42,0xca,0xc1,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x40,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x40,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: 
v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x44,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x44,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x62,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x62,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6a,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x6a,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6e,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x6e,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x54,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x54,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x70,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x70,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x56,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x56,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x50,0xca,0xf0,0x00,0x06,0xff] +0xf0,0x06,0x50,0xca,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4e,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x4e,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x46,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x46,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4a,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x4a,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; 
encoding: [0xf0,0x06,0x68,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x68,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4c,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x4c,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x48,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x48,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x60,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x60,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x52,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x52,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x42,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x42,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x40,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x40,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x62,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x62,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: 
[0xff,0x04,0x6a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x54,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x54,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x70,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x70,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x56,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x56,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x46,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x46,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x68,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x68,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; 
encoding: [0xff,0x04,0x4c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x45,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x45,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x51,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x51,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x48,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x48,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x60,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x60,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6c,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x6c,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x52,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x52,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x40,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x40,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x62,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x62,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6a,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x6a,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: 
[0x7f,0x04,0x6e,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x6e,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x54,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x54,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x70,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x70,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x56,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x56,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4e,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x4e,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x46,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x46,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4a,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x4a,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x68,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x68,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4c,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x4c,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x7f,0xfe,0x51,0xca,0x7f,0x00,0x06,0xff] +0x7f,0xfe,0x51,0xca,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x48,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x48,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; 
encoding: [0x7e,0x04,0x60,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x60,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6c,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x6c,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x52,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x52,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x40,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x40,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x62,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x62,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6a,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x6a,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6e,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x6e,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x54,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x54,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x70,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x70,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x56,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x56,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4e,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x4e,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, 
exec_lo, v3 ; encoding: [0x7e,0x04,0x46,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x46,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4a,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x4a,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x68,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x68,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4c,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x4c,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x7e,0xfe,0x51,0xca,0x7e,0x00,0x06,0xff] +0x7e,0xfe,0x51,0xca,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x48,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x48,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x60,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x60,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6c,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x6c,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x52,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x52,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x40,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x40,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x62,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x62,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6a,0xca,0x7d,0x06,0x06,0xff] 
+0x7d,0x04,0x6a,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6e,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x6e,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x54,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x54,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x70,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x70,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x56,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x56,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4e,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x4e,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x46,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x46,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4a,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x4a,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x68,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x68,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4c,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x4c,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x51,0xca,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0x51,0xca,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x48,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x48,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_add_nc_u32 
v6, s1, v3 ; encoding: [0x01,0x04,0x60,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x60,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6c,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x6c,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x52,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x52,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x40,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x40,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x62,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x62,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x6a,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x6a,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6e,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x6e,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x54,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x54,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x70,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x70,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x56,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x56,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4e,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x4e,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x46,0xca,0x01,0x06,0x06,0xff] 
+0x01,0x04,0x46,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4a,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x4a,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x68,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x68,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4c,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x4c,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s1, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x01,0xfe,0x51,0xca,0x01,0x00,0x06,0xff] +0x01,0xfe,0x51,0xca,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x48,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x48,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x60,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x60,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6c,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x6c,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x52,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x52,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x40,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x40,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x62,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x62,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x6a,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x6a,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 
v255, s105, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6e,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x6e,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x54,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x54,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x70,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x70,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x56,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x56,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4e,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x4e,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x46,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x46,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4a,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x4a,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x68,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x68,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4c,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x4c,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, s105, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x69,0xfe,0x51,0xca,0x69,0x00,0x06,0xff] +0x69,0xfe,0x51,0xca,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x48,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x48,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x60,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x60,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6c,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x6c,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x52,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x52,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x40,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x40,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x62,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x62,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6a,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x6a,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6e,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x6e,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x54,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x54,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x70,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x70,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x56,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x56,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4e,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x4e,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x46,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x46,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4a,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x4a,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x68,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x68,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4c,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x4c,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x51,0xca,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0x51,0xca,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x48,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x48,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x60,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x60,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6c,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x6c,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x52,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x52,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x40,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x40,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x62,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x62,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: 
[0x7b,0x04,0x6a,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x6a,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6e,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x6e,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x54,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x54,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x70,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x70,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x56,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x56,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4e,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x4e,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x46,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x46,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4a,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x4a,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x68,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x68,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4c,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x4c,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7b,0xfe,0x51,0xca,0x7b,0x00,0x06,0xff] +0x7b,0xfe,0x51,0xca,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: 
[0x01,0x05,0x48,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x48,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x60,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x60,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6c,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x6c,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x52,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x52,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x42,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x42,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x40,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x40,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x62,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x62,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x6a,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x6a,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6e,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x6e,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x54,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x54,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x70,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x70,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: 
[0x01,0x05,0x56,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x56,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4e,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x4e,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x46,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x46,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4a,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x4a,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x68,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x68,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4c,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x4c,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x45,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x45,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x51,0xca,0xff,0x01,0x06,0xff] +0x01,0xff,0x51,0xca,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x48,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x48,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x60,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x60,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6c,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x6c,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x52,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x52,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x42,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x42,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x40,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x40,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x62,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x62,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x6a,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x6a,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6e,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x6e,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x54,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x54,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x70,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x70,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x56,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x56,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4e,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x4e,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x46,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x46,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4a,0xca,0x03,0x07,0x06,0xff] 
+0x02,0x05,0x4a,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x68,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x68,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4c,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x4c,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x45,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x45,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x51,0xca,0x03,0x01,0x06,0xff] +0x02,0xff,0x51,0xca,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x48,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x48,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x60,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x60,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6c,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x6c,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x52,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x52,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x42,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x42,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x40,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x40,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: 
[0xff,0x05,0x62,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x62,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x6a,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x6a,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6e,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x6e,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x54,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x54,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x70,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x70,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x56,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x56,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4e,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x4e,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x46,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x46,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4a,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x4a,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x68,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x68,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4c,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x4c,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x45,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0xff,0x45,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x51,0xca,0x02,0x01,0x06,0xff] +0xff,0xff,0x51,0xca,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x48,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x48,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x60,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x60,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6c,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x6c,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x52,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x52,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x42,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x42,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x40,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x40,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x62,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x62,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x6a,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x6a,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6e,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x6e,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x54,0xca,0x04,0x07,0x06,0xff] 
+0x03,0x05,0x54,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x70,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x70,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x56,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x56,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4e,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x4e,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x46,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x46,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4a,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x4a,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x68,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x68,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4c,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x4c,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x45,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x45,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x51,0xca,0x04,0x01,0x06,0xff] +0x03,0xff,0x51,0xca,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x48,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x48,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x60,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x60,0xca,0x01,0x07,0x06,0xff + +# 
GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6c,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x6c,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x52,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x52,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x42,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x42,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x40,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x40,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x62,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x62,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x6a,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x6a,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6e,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x6e,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x54,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x54,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x70,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x70,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x56,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x56,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4e,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x4e,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 
:: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x46,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x46,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4a,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x4a,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x68,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x68,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4c,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x4c,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x45,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x45,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x51,0xca,0x01,0x01,0x06,0xff] +0x04,0xff,0x51,0xca,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x48,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x48,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x60,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x60,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6c,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x6c,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x52,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x52,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x40,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x40,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: 
v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x62,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x62,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6a,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x6a,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6e,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x6e,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x54,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x54,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x70,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x70,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x56,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x56,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4e,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x4e,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x46,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x46,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4a,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x4a,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x68,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x68,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4c,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x4c,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v255 :: 
v_dual_mov_b32 v6, vcc_hi ; encoding: [0x6b,0xfe,0x51,0xca,0x6b,0x00,0x06,0xff] +0x6b,0xfe,0x51,0xca,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x48,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x48,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x60,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x60,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6c,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x6c,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x52,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x52,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x40,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x40,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x62,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x62,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6a,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x6a,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6e,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x6e,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x54,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x54,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x70,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x70,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 
v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x56,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x56,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4e,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x4e,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x46,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x46,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4a,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x4a,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x68,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x68,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4c,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x4c,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x6a,0xfe,0x51,0xca,0x6a,0x00,0x06,0xff] +0x6a,0xfe,0x51,0xca,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x48,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x48,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x60,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x60,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0x52,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x52,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x42,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x42,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x40,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x40,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x44,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x44,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x62,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x62,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x54,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x54,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x70,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x70,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_min_num_f32 v255, 
0xaf123456, v4 ; encoding: [0x7c,0x0a,0x56,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x56,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x50,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x50,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x46,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x46,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x68,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x68,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_cndmask_b32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x48,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x48,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x60,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x60,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: 
v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6c,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x6c,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_cndmask_b32 v6, 0.5, v5 ; encoding: [0xc1,0x08,0x52,0xc8,0xf0,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x52,0xc8,0xf0,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x42,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x42,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x40,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x40,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x44,0xc8,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x44,0xc8,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x62,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x62,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6a,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x6a,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6e,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x6e,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x54,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xc1,0x08,0x54,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x70,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x70,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x56,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x56,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x50,0xc8,0xfd,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x50,0xc8,0xfd,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4e,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x4e,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x46,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x46,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4a,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x4a,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x68,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x68,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, -1, v4, 0xaf123456 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4c,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x4c,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: 
[0xf0,0x06,0x48,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x48,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x60,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x60,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6c,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x6c,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_cndmask_b32 v6, -1, v2 ; encoding: [0xf0,0x06,0x52,0xc8,0xc1,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x52,0xc8,0xc1,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x42,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x42,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x40,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x40,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x44,0xc8,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x44,0xc8,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x62,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x62,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6a,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x6a,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 
0xaf123456 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6e,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x6e,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x54,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x54,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x70,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x70,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x56,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x56,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x50,0xc8,0xf0,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x50,0xc8,0xf0,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4e,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x4e,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x46,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x46,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4a,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x4a,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0.5, v3, 0xaf123456 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x68,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x68,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 
v255, 0.5, v3, 0xaf123456 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4c,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x4c,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x48,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x48,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x60,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x60,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x52,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x52,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x42,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x42,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x40,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x40,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x62,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x62,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: 
[0xff,0x04,0x6a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x54,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x54,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x70,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x70,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x56,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x56,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x46,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x46,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x68,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x68,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v255, 0xaf123456, v2, 0xaf123456 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x45,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x45,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x51,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x51,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x48,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x48,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x60,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x60,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6c,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x6c,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x42,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x42,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x40,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x40,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: 
[0x7f,0x04,0x62,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x62,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6a,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x6a,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x6e,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x6e,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x54,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x54,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x70,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x70,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x56,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x56,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4e,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x4e,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x46,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x46,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4a,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x4a,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x68,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x68,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v2, 0xaf123456 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x4c,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x4c,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x45,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x45,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_hi, v255, 0xaf123456 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x7f,0xfe,0x51,0xc8,0x7f,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x51,0xc8,0x7f,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x48,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x48,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x60,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x60,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6c,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x6c,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x42,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x42,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: 
[0x7e,0x04,0x40,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x40,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x62,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x62,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6a,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x6a,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x6e,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x6e,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x54,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x54,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x70,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x70,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x56,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x56,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4e,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x4e,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x46,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x46,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# 
GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4a,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x4a,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x68,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x68,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v2, 0xaf123456 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x4c,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x4c,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x45,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x45,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, exec_lo, v255, 0xaf123456 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x7e,0xfe,0x51,0xc8,0x7e,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x51,0xc8,0x7e,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x48,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x48,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x60,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x60,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6c,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x6c,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: 
[0x7d,0x04,0x42,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x42,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x40,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x40,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x62,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x62,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6a,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x6a,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6e,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x6e,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x54,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x54,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x70,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x70,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x56,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x56,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4e,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x4e,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_mul_f32 v6, m0, v3 ; 
encoding: [0x7d,0x04,0x46,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x46,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4a,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x4a,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x68,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x68,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v2, 0xaf123456 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4c,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x4c,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x45,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x45,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, m0, v255, 0xaf123456 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x51,0xc8,0x7d,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x51,0xc8,0x7d,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x48,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x48,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x60,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x60,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6c,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x6c,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_fmaak_f32 
v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x42,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x42,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x40,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x40,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x62,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x62,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x6a,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x6a,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x6e,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x6e,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x54,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x54,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x01,0x04,0x70,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x70,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x56,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x56,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4e,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x4e,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 
:: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x46,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x46,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4a,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x4a,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x01,0x04,0x68,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x68,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v2, 0xaf123456 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x01,0x04,0x4c,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x4c,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x45,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x45,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s1, v255, 0xaf123456 :: v_dual_mov_b32 v6, s1 ; encoding: [0x01,0xfe,0x51,0xc8,0x01,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x51,0xc8,0x01,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x48,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x48,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x60,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x60,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6c,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x6c,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 
v255, s105, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x42,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x42,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x40,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x40,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x62,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x62,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x6a,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x6a,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x6e,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x6e,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x54,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x54,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x69,0x04,0x70,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x70,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x56,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x56,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4e,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x69,0x04,0x4e,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x46,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x46,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4a,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x4a,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x69,0x04,0x68,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x68,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v2, 0xaf123456 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x69,0x04,0x4c,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x4c,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x45,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x45,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, s105, v255, 0xaf123456 :: v_dual_mov_b32 v6, s105 ; encoding: [0x69,0xfe,0x51,0xc8,0x69,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x51,0xc8,0x69,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x48,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x48,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x60,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x60,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0x6c,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x6c,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x42,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x42,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x40,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x40,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x62,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x62,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6a,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x6a,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6e,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x6e,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x54,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x54,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x70,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x70,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x56,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x56,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, 
v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4e,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x4e,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x46,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x46,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4a,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x4a,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x68,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x68,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v2, 0xaf123456 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4c,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x4c,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x45,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x45,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, src_scc, v255, 0xaf123456 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x51,0xc8,0xc1,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x51,0xc8,0xc1,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x48,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x48,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x60,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x7b,0x04,0x60,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6c,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x6c,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x42,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x42,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x40,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x40,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x62,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x62,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6a,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x6a,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x6e,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x6e,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x54,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x54,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x70,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x70,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_min_num_f32 
v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x56,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x56,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4e,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x4e,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x46,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x46,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4a,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x4a,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x68,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x68,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v2, 0xaf123456 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x4c,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x4c,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x45,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x45,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, ttmp15, v255, 0xaf123456 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7b,0xfe,0x51,0xc8,0x7b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x51,0xc8,0x7b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x48,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x48,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf 
+ +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x60,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x60,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6c,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x6c,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x52,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x52,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x42,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x42,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x40,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x40,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x62,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x62,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x6a,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x6a,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6e,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x6e,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x54,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0x05,0x54,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x70,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x70,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x56,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x56,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4e,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x4e,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x46,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x46,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4a,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x4a,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x68,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x68,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4c,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x4c,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x45,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x45,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v1, v255, 0xaf123456 :: v_dual_mov_b32 v6, v255 ; encoding: 
[0x01,0xff,0x51,0xc8,0xff,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x51,0xc8,0xff,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x48,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x48,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x60,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x60,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6c,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x6c,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x52,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x52,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x42,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x42,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x40,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x40,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x62,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x62,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x6a,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x6a,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_max_i32 v6, v3, 
v3 ; encoding: [0x02,0x05,0x6e,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x6e,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x54,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x54,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x70,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x70,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x56,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x56,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4e,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x4e,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x46,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x46,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4a,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x4a,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x68,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x68,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4c,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x4c,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v255, 0xaf123456 :: v_dual_fmamk_f32 
v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x45,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x45,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v2, v255, 0xaf123456 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x51,0xc8,0x03,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x51,0xc8,0x03,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x48,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x48,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x60,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x60,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6c,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x6c,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x52,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x52,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x42,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x42,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x40,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x40,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x62,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x62,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, 
v255, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x6a,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x6a,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6e,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x6e,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x54,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x54,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x70,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x70,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x56,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x56,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4e,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x4e,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x46,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x46,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4a,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x4a,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x68,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x68,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v255, v255, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4c,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x4c,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x45,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x45,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v255, v255, 0xaf123456 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x51,0xc8,0x02,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x51,0xc8,0x02,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x48,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x48,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x60,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x60,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6c,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x6c,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x52,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x52,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x42,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x42,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x40,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x03,0x05,0x40,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x62,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x62,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x6a,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x6a,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6e,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x6e,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x54,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x54,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x70,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x70,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x56,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x56,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4e,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x4e,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x46,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x46,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_sub_f32 v6, v4, v3 ; encoding: 
[0x03,0x05,0x4a,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x4a,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x68,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x68,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4c,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x4c,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x45,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x45,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v3, v255, 0xaf123456 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x51,0xc8,0x04,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x51,0xc8,0x04,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x48,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x48,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x60,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x60,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6c,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x6c,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x52,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x52,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, v1, 
v3, 0xaf123456 ; encoding: [0x04,0x05,0x42,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x42,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x40,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x40,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x62,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x62,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x6a,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x6a,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6e,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x6e,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x54,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x54,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x70,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x70,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x56,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x56,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4e,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x4e,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: 
v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x46,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x46,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4a,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x4a,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x68,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x68,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v2, 0xaf123456 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4c,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x4c,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x45,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x45,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, v4, v255, 0xaf123456 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x51,0xc8,0x01,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x51,0xc8,0x01,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x48,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x48,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x60,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x60,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6c,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x6c,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x42,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x42,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x40,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x40,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x62,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x62,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6a,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x6a,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x6e,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x6e,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x54,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x54,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x70,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x70,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x56,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x56,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: 
[0x6b,0x04,0x4e,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x4e,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x46,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x46,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4a,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x4a,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x68,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x68,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v2, 0xaf123456 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x4c,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x4c,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x45,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x45,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_hi, v255, 0xaf123456 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x6b,0xfe,0x51,0xc8,0x6b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x51,0xc8,0x6b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x48,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x48,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x60,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x60,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6c,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x6c,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x42,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x42,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x40,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x40,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x62,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x62,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6a,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x6a,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x6e,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x6e,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x54,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x54,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x70,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x70,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: 
[0x6a,0x04,0x56,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x56,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4e,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x4e,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x46,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x46,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4a,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x4a,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x68,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x68,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v2, 0xaf123456 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x4c,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x4c,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v255, 0xaf123456 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x45,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x45,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v255, vcc_lo, v255, 0xaf123456 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x6a,0xfe,0x51,0xc8,0x6a,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x51,0xc8,0x6a,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x48,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x48,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x60,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x60,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x52,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x52,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x42,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x42,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x40,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x40,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x44,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x44,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x62,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x62,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0x6e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x54,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x54,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x70,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x70,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x56,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x56,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x50,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x50,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x46,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x46,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x68,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x68,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmaak_f32 v6, null, v5, 0xaf123456 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x08,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x08,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x20,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x20,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2c,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x2c,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x12,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x12,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x02,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x02,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x00,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x00,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v4 ; encoding: [0xc1,0x08,0x04,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x04,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x22,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x22,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x2a,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x2a,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: 
[0xc1,0x08,0x2e,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x2e,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x14,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x14,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x30,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x30,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x16,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x16,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x10,0xc8,0xfd,0x00,0x06,0xff] +0xc1,0x08,0x10,0xc8,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0e,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x0e,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x06,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x06,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0a,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x0a,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x28,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x28,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x0c,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x0c,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x08,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x08,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x20,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x20,0xc8,0xf0,0x04,0x06,0xff + +# 
GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2c,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x2c,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x12,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x12,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x02,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x02,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x00,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x00,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v4 ; encoding: [0xf0,0x06,0x04,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x04,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x22,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x22,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2a,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x2a,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x2e,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x2e,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x14,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x14,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x30,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x30,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x16,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x16,0xc8,0xf0,0x04,0x06,0xff + +# 
GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x10,0xc8,0xf0,0x00,0x06,0xff] +0xf0,0x06,0x10,0xc8,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0e,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x0e,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x06,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x06,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0a,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x0a,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x28,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x28,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x0c,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x0c,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x08,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x08,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x20,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x20,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x2c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x12,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x12,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; 
encoding: [0xff,0x04,0x02,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x02,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x00,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x00,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x22,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x22,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x2a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x2a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x2e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x2e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x14,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x14,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x30,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x30,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x16,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x16,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x0e,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: 
[0xff,0x04,0x06,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x06,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x0a,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x28,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x28,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x0c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x0c,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 ; encoding: [0xff,0xfe,0x05,0xc8,0x7c,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x05,0xc8,0x7c,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x11,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x11,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x08,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x08,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x20,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x20,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2c,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x2c,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x12,0xc8,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x12,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: 
v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x02,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x02,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x00,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x00,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x22,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x22,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2a,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x2a,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x2e,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x2e,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x14,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x14,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x30,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x30,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x16,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x16,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0e,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x0e,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x06,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x06,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0a,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x0a,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 
v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x28,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x28,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x0c,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x0c,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v4 ; encoding: [0x7f,0xfe,0x05,0xc8,0x7f,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x05,0xc8,0x7f,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x11,0xc8,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0x11,0xc8,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x08,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x08,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x20,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x20,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2c,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x2c,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x12,0xc8,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x12,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x02,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x02,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x00,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x00,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x22,0xc8,0x7b,0x06,0x06,0xff] 
+0x7e,0x04,0x22,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2a,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x2a,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x2e,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x2e,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x14,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x14,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x30,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x30,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x16,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x16,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0e,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x0e,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x06,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x06,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0a,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x0a,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x28,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x28,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x0c,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x0c,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v4 ; encoding: [0x7e,0xfe,0x05,0xc8,0x7e,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x7e,0xfe,0x05,0xc8,0x7e,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x11,0xc8,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0x11,0xc8,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x08,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x08,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x20,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x20,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2c,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x2c,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x12,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x12,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x02,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x02,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x00,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x00,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x22,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x22,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2a,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x2a,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x2e,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x2e,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x14,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x14,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: 
v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x30,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x30,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x16,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x16,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0e,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x0e,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x06,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x06,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0a,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x0a,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x28,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x28,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x0c,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x0c,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v4 ; encoding: [0x7d,0xfe,0x05,0xc8,0x7d,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x05,0xc8,0x7d,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x11,0xc8,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0x11,0xc8,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x08,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x08,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x20,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x20,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: 
[0x01,0x04,0x2c,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x2c,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x12,0xc8,0x01,0x06,0x06,0xff] +0x01,0x04,0x12,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x02,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x02,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x00,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x00,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x22,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x22,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x2a,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x2a,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x2e,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x2e,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x14,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x14,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x30,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x30,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x16,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x16,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0e,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x0e,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x06,0xc8,0x69,0x06,0x06,0xff] 
+0x01,0x04,0x06,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0a,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x0a,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x28,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x28,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x0c,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0x0c,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v4 ; encoding: [0x01,0xfe,0x05,0xc8,0x01,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x05,0xc8,0x01,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x11,0xc8,0x69,0x00,0x06,0xff] +0x01,0xfe,0x11,0xc8,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x08,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x08,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x20,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x20,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2c,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x2c,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x12,0xc8,0x69,0x06,0x06,0xff] +0x69,0x04,0x12,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x02,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x02,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x00,0xc8,0x01,0x06,0x06,0xff] 
+0x69,0x04,0x00,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x22,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x22,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x2a,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x2a,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x2e,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x2e,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x14,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x14,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x30,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x30,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x16,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x16,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0e,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x0e,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x06,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x06,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0a,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x0a,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x28,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x28,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x0c,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0x0c,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, s105, v255 :: v_dual_fmamk_f32 
v6, s105, 0xaf123456, v4 ; encoding: [0x69,0xfe,0x05,0xc8,0x69,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x05,0xc8,0x69,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x11,0xc8,0x01,0x00,0x06,0xff] +0x69,0xfe,0x11,0xc8,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x08,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x08,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x20,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x20,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2c,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x2c,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x12,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x12,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x02,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x02,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x00,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x00,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x22,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x22,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2a,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x2a,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x2e,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x2e,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: 
v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x14,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x14,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x30,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x30,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x16,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x16,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0e,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x0e,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x06,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x06,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0a,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x0a,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x28,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x28,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x0c,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x0c,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v4 ; encoding: [0xfd,0xfe,0x05,0xc8,0xc1,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x05,0xc8,0xc1,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x11,0xc8,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0x11,0xc8,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x08,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x08,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, 
vcc_lo, v3 ; encoding: [0x7b,0x04,0x20,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x20,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2c,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x2c,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x12,0xc8,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x12,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x02,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x02,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x00,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x00,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x22,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x22,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2a,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x2a,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x2e,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x2e,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x14,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x14,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x30,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x30,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x16,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x16,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: 
v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0e,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x0e,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x06,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x06,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0a,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x0a,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x28,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x28,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x0c,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x0c,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v4 ; encoding: [0x7b,0xfe,0x05,0xc8,0x7b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x05,0xc8,0x7b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x11,0xc8,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0x11,0xc8,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x08,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x08,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x20,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x20,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2c,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x2c,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x12,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x12,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, 
v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x02,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x02,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x00,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x00,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x22,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x22,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x2a,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x2a,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x2e,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x2e,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x14,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x14,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x30,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x30,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x16,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x16,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0e,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x0e,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x06,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x06,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0a,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x0a,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: 
[0x01,0x05,0x28,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x28,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x0c,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0x0c,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v4 ; encoding: [0x01,0xff,0x05,0xc8,0xff,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x05,0xc8,0xff,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x11,0xc8,0xff,0x01,0x06,0xff] +0x01,0xff,0x11,0xc8,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x08,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x08,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x20,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x20,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2c,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x2c,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x12,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x12,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x02,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x02,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x00,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x00,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x22,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x22,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: 
[0x02,0x05,0x2a,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x2a,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x2e,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x2e,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x14,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x14,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x30,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x30,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x16,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x16,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0e,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x0e,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x06,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x06,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0a,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x0a,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x28,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x28,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x0c,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0x0c,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v4 ; encoding: [0x02,0xff,0x05,0xc8,0x03,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x05,0xc8,0x03,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x11,0xc8,0x03,0x01,0x06,0xff] +0x02,0xff,0x11,0xc8,0x03,0x01,0x06,0xff + 
+# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x08,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x08,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x20,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x20,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2c,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x2c,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x12,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x12,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x02,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x02,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x00,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x00,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x22,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x22,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x2a,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x2a,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x2e,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x2e,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x14,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x14,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x30,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x30,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: 
v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x16,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x16,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0e,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x0e,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x06,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x06,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0a,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x0a,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x28,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x28,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x0c,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0x0c,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v4 ; encoding: [0xff,0xff,0x05,0xc8,0x02,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x05,0xc8,0x02,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x11,0xc8,0x02,0x01,0x06,0xff] +0xff,0xff,0x11,0xc8,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x08,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x08,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x20,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x20,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2c,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x2c,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: 
[0x03,0x05,0x12,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x12,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x02,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x02,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x00,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x00,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x22,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x22,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x2a,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x2a,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x2e,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x2e,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x14,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x14,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x30,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x30,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x16,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x16,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0e,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x0e,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x06,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x06,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0a,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x0a,0xc8,0x04,0x07,0x06,0xff 
+ +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x28,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x28,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x0c,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0x0c,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v4 ; encoding: [0x03,0xff,0x05,0xc8,0x04,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x05,0xc8,0x04,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x11,0xc8,0x04,0x01,0x06,0xff] +0x03,0xff,0x11,0xc8,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x08,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x08,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x20,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x20,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2c,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x2c,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x12,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x12,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x02,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x02,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x00,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x00,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x22,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x22,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 
v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x2a,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x2a,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x2e,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x2e,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x14,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x14,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x30,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x30,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x16,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x16,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0e,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x0e,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x06,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x06,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0a,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x0a,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x28,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x28,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x0c,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0x0c,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v4 ; encoding: [0x04,0xff,0x05,0xc8,0x01,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x05,0xc8,0x01,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: 
[0x04,0xff,0x11,0xc8,0x01,0x01,0x06,0xff] +0x04,0xff,0x11,0xc8,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x08,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x08,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x20,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x20,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2c,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x2c,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x12,0xc8,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x12,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x02,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x02,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x00,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x00,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x22,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x22,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2a,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x2a,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x2e,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x2e,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x14,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x14,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 
; encoding: [0x6b,0x04,0x30,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x30,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x16,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x16,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0e,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x0e,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x06,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x06,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0a,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x0a,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x28,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x28,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x0c,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x0c,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v4 ; encoding: [0x6b,0xfe,0x05,0xc8,0x6b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x05,0xc8,0x6b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x11,0xc8,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0x11,0xc8,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x08,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x08,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x20,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x20,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 
v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2c,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x2c,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x12,0xc8,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x12,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x02,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x02,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x00,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x00,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x22,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x22,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2a,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x2a,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x2e,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x2e,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x14,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x14,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x30,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x30,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x16,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x16,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0e,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x0e,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 
:: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x06,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x06,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0a,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x0a,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x28,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x28,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x0c,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x0c,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v4 ; encoding: [0x6a,0xfe,0x05,0xc8,0x6a,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x05,0xc8,0x6a,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x11,0xc8,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0x11,0xc8,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x08,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x08,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x20,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x20,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x2c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x12,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x12,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: 
v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x02,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x02,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x00,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x00,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0x7c,0x0a,0x04,0xc8,0xff,0xfe,0xff,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x04,0xc8,0xff,0xfe,0xff,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x22,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x22,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x2a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x2e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x2e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x14,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x14,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x30,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x30,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x16,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x16,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: 
v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x10,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x10,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x0e,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x06,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x06,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x0a,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x28,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x28,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmac_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x0c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x0c,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_add_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x89,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x89,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xa1,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0xa1,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xad,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0xad,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 
0xaf123456, v255 :: v_dual_cndmask_b32 v6, 0.5, v4 ; encoding: [0xc1,0xfe,0x93,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x93,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, src_scc, v4, 0xaf123456 ; encoding: [0xc1,0xfe,0x83,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x83,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x81,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x81,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v4 ; encoding: [0xc1,0xfe,0x85,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x85,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xa3,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0xa3,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xab,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0xab,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_max_i32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xaf,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0xaf,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x95,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x95,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_min_i32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xb1,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xc1,0xfe,0xb1,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x97,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x97,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0xfe,0x91,0xc8,0xfd,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x91,0xc8,0xfd,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x8f,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x8f,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_mul_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x87,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x87,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_sub_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x8b,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x8b,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0xa9,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0xa9,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, -1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, src_scc, v4 ; encoding: [0xc1,0xfe,0x8d,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0xfe,0x8d,0xc8,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_add_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x89,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x89,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, 0.5, v3 ; encoding: 
[0xf0,0xfe,0xa1,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xa1,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xad,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xad,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xf0,0xfe,0x93,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x93,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, 0.5, v3, 0xaf123456 ; encoding: [0xf0,0xfe,0x83,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x83,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmac_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x81,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x81,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v4 ; encoding: [0xf0,0xfe,0x85,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x85,0xc8,0xf0,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xa3,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xa3,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xab,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xab,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_max_i32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xaf,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xaf,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 
0.5, 0xaf123456, v255 :: v_dual_max_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x95,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x95,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_min_i32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xb1,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xb1,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_min_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x97,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x97,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0xfe,0x91,0xc8,0xf0,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x91,0xc8,0xf0,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x8f,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x8f,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_mul_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x87,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x87,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_sub_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x8b,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x8b,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0xa9,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0xa9,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0.5, 0xaf123456, v255 :: v_dual_subrev_f32 v6, 0.5, v3 ; encoding: [0xf0,0xfe,0x8d,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0xfe,0x8d,0xc8,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + 
+# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_add_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x89,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x89,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, null, v255 ; encoding: [0xff,0xfe,0xa1,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xa1,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, null, v255 ; encoding: [0xff,0xfe,0xad,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xad,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, null, v255 ; encoding: [0xff,0xfe,0x93,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x93,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, null, v255, 0xaf123456 ; encoding: [0xff,0xfe,0x83,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x83,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmac_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x81,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x81,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 ; encoding: [0xff,0xfe,0x85,0xc8,0x7c,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x85,0xc8,0x7c,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, null, v255 ; encoding: [0xff,0xfe,0xa3,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xa3,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_lshrrev_b32 
v6, null, v255 ; encoding: [0xff,0xfe,0xab,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xab,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_max_i32 v6, null, v255 ; encoding: [0xff,0xfe,0xaf,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xaf,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_max_num_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x95,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x95,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_min_i32 v6, null, v255 ; encoding: [0xff,0xfe,0xb1,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xb1,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_min_num_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x97,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x97,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x91,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x91,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x8f,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x8f,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_mul_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x87,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x87,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_sub_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x8b,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0xfe,0x8b,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, null, v255 ; encoding: [0xff,0xfe,0xa9,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xa9,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 :: v_dual_subrev_f32 v6, null, v255 ; encoding: [0xff,0xfe,0x8d,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x8d,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_add_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x89,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x89,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xa1,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xa1,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xad,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xad,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, exec_hi, v255, 0xaf123456 ; encoding: [0x7f,0xfe,0x83,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x83,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmac_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x81,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x81,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v4 ; encoding: [0x7f,0xfe,0x85,0xc8,0x7f,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x85,0xc8,0x7f,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xa3,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xa3,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xab,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xab,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_max_i32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xaf,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xaf,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_max_num_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x95,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x95,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_min_i32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xb1,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xb1,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_min_num_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x97,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x97,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x7f,0xfe,0x91,0xc8,0x7f,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x91,0xc8,0x7f,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x8f,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x8f,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_mul_f32 v6, exec_hi, v255 ; encoding: 
[0x7f,0xfe,0x87,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x87,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_sub_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x8b,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x8b,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0xa9,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xa9,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_hi, 0xaf123456, v255 :: v_dual_subrev_f32 v6, exec_hi, v255 ; encoding: [0x7f,0xfe,0x8d,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x8d,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_add_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x89,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x89,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xa1,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xa1,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xad,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xad,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, exec_lo, v255, 0xaf123456 ; encoding: [0x7e,0xfe,0x83,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x83,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmac_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x81,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x7e,0xfe,0x81,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v4 ; encoding: [0x7e,0xfe,0x85,0xc8,0x7e,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x85,0xc8,0x7e,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xa3,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xa3,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xab,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xab,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_max_i32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xaf,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xaf,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_max_num_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x95,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x95,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_min_i32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xb1,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xb1,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_min_num_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x97,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x97,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x7e,0xfe,0x91,0xc8,0x7e,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x91,0xc8,0x7e,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 
exec_lo, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x8f,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x8f,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_mul_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x87,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x87,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_sub_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x8b,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x8b,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0xa9,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xa9,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, exec_lo, 0xaf123456, v255 :: v_dual_subrev_f32 v6, exec_lo, v255 ; encoding: [0x7e,0xfe,0x8d,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x8d,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_add_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x89,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x89,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xa1,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xa1,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xad,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xad,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, m0, v255, 0xaf123456 ; encoding: 
[0x7d,0xfe,0x83,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x83,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmac_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x81,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x81,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v4 ; encoding: [0x7d,0xfe,0x85,0xc8,0x7d,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x85,0xc8,0x7d,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xa3,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xa3,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xab,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xab,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_max_i32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xaf,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xaf,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_max_num_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x95,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x95,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_min_i32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xb1,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xb1,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_min_num_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x97,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x97,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, 
v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x91,0xc8,0x7d,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x91,0xc8,0x7d,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x8f,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x8f,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_mul_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x87,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x87,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_sub_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x8b,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x8b,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, m0, v255 ; encoding: [0x7d,0xfe,0xa9,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xa9,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, m0, 0xaf123456, v255 :: v_dual_subrev_f32 v6, m0, v255 ; encoding: [0x7d,0xfe,0x8d,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x8d,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_add_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x89,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x89,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, s1, v255 ; encoding: [0x01,0xfe,0xa1,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xa1,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, s1, v255 ; encoding: [0x01,0xfe,0xad,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xad,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, s1, v255, 0xaf123456 ; encoding: [0x01,0xfe,0x83,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x83,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x81,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x81,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v4 ; encoding: [0x01,0xfe,0x85,0xc8,0x01,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x85,0xc8,0x01,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, s1, v255 ; encoding: [0x01,0xfe,0xa3,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xa3,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, s1, v255 ; encoding: [0x01,0xfe,0xab,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xab,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_max_i32 v6, s1, v255 ; encoding: [0x01,0xfe,0xaf,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xaf,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x95,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x95,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_min_i32 v6, s1, v255 ; encoding: [0x01,0xfe,0xb1,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xb1,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x97,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0xfe,0x97,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x01,0xfe,0x91,0xc8,0x01,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x91,0xc8,0x01,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x8f,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x8f,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_mul_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x87,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x87,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_sub_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x8b,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x8b,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, s1, v255 ; encoding: [0x01,0xfe,0xa9,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xa9,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, s1, v255 ; encoding: [0x01,0xfe,0x8d,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x8d,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_add_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x89,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x89,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, s105, v255 ; encoding: [0x69,0xfe,0xa1,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xa1,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, s105, v255 ; encoding: 
[0x69,0xfe,0xad,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xad,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, s105, v255, 0xaf123456 ; encoding: [0x69,0xfe,0x83,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x83,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmac_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x81,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x81,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v4 ; encoding: [0x69,0xfe,0x85,0xc8,0x69,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x85,0xc8,0x69,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, s105, v255 ; encoding: [0x69,0xfe,0xa3,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xa3,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, s105, v255 ; encoding: [0x69,0xfe,0xab,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xab,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_max_i32 v6, s105, v255 ; encoding: [0x69,0xfe,0xaf,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xaf,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_max_num_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x95,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x95,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_min_i32 v6, s105, v255 ; encoding: [0x69,0xfe,0xb1,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xb1,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_min_num_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x97,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x97,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x69,0xfe,0x91,0xc8,0x69,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x91,0xc8,0x69,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x8f,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x8f,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_mul_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x87,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x87,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_sub_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x8b,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x8b,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, s105, v255 ; encoding: [0x69,0xfe,0xa9,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xa9,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, s105, 0xaf123456, v255 :: v_dual_subrev_f32 v6, s105, v255 ; encoding: [0x69,0xfe,0x8d,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x8d,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_add_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x89,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x89,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xa1,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0xfd,0xfe,0xa1,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xad,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xad,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, -1, v255, 0xaf123456 ; encoding: [0xfd,0xfe,0x83,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x83,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmac_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x81,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x81,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v4 ; encoding: [0xfd,0xfe,0x85,0xc8,0xc1,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x85,0xc8,0xc1,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xa3,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xa3,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xab,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xab,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_max_i32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xaf,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xaf,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_max_num_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x95,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x95,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 
:: v_dual_min_i32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xb1,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xb1,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_min_num_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x97,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x97,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x91,0xc8,0xc1,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x91,0xc8,0xc1,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x8f,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x8f,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_mul_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x87,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x87,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_sub_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x8b,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x8b,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, -1, v255 ; encoding: [0xfd,0xfe,0xa9,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xa9,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, src_scc, 0xaf123456, v255 :: v_dual_subrev_f32 v6, -1, v255 ; encoding: [0xfd,0xfe,0x8d,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x8d,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_add_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x89,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x7b,0xfe,0x89,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xa1,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xa1,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xad,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xad,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, ttmp15, v255, 0xaf123456 ; encoding: [0x7b,0xfe,0x83,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x83,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmac_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x81,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x81,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v4 ; encoding: [0x7b,0xfe,0x85,0xc8,0x7b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x85,0xc8,0x7b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xa3,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xa3,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xab,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xab,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_max_i32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xaf,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xaf,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 
ttmp15, 0xaf123456, v255 :: v_dual_max_num_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x95,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x95,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_min_i32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xb1,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xb1,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_min_num_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x97,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x97,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7b,0xfe,0x91,0xc8,0x7b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x91,0xc8,0x7b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x8f,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x8f,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_mul_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x87,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x87,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_sub_f32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0x8b,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x8b,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, ttmp15, v255 ; encoding: [0x7b,0xfe,0xa9,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xa9,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, ttmp15, 0xaf123456, v255 :: v_dual_subrev_f32 v6, ttmp15, v255 ; encoding: 
[0x7b,0xfe,0x8d,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x8d,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_add_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x89,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x89,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v255, v255 ; encoding: [0x01,0xff,0xa1,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xa1,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v255, v255 ; encoding: [0x01,0xff,0xad,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xad,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v255, v255 ; encoding: [0x01,0xff,0x93,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x93,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v255, v255, 0xaf123456 ; encoding: [0x01,0xff,0x83,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x83,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x81,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x81,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v4 ; encoding: [0x01,0xff,0x85,0xc8,0xff,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x85,0xc8,0xff,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v255, v255 ; encoding: [0x01,0xff,0xa3,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xa3,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v255, v255 ; encoding: [0x01,0xff,0xab,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xab,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_max_i32 v6, v255, v255 ; encoding: [0x01,0xff,0xaf,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xaf,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x95,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x95,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_min_i32 v6, v255, v255 ; encoding: [0x01,0xff,0xb1,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xb1,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x97,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x97,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x91,0xc8,0xff,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x91,0xc8,0xff,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x8f,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x8f,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_mul_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x87,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x87,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_sub_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x8b,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0xff,0x8b,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v255, v255 ; encoding: [0x01,0xff,0xa9,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xa9,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v1, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v255, v255 ; encoding: [0x01,0xff,0x8d,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x8d,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_add_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x89,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x89,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v3, v255 ; encoding: [0x02,0xff,0xa1,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xa1,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v3, v255 ; encoding: [0x02,0xff,0xad,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xad,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v3, v255 ; encoding: [0x02,0xff,0x93,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x93,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v3, v255, 0xaf123456 ; encoding: [0x02,0xff,0x83,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x83,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x81,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x81,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v4 ; 
encoding: [0x02,0xff,0x85,0xc8,0x03,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x85,0xc8,0x03,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v3, v255 ; encoding: [0x02,0xff,0xa3,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xa3,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v3, v255 ; encoding: [0x02,0xff,0xab,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xab,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_max_i32 v6, v3, v255 ; encoding: [0x02,0xff,0xaf,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xaf,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x95,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x95,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_min_i32 v6, v3, v255 ; encoding: [0x02,0xff,0xb1,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xb1,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x97,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x97,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x91,0xc8,0x03,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x91,0xc8,0x03,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x8f,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x8f,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, 
v255 :: v_dual_mul_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x87,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x87,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_sub_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x8b,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x8b,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v3, v255 ; encoding: [0x02,0xff,0xa9,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xa9,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v2, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v3, v255 ; encoding: [0x02,0xff,0x8d,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x8d,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_add_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x89,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x89,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v2, v255 ; encoding: [0xff,0xff,0xa1,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xa1,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v2, v255 ; encoding: [0xff,0xff,0xad,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xad,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v2, v255 ; encoding: [0xff,0xff,0x93,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x93,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v2, v255, 0xaf123456 ; encoding: [0xff,0xff,0x83,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0xff,0x83,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x81,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x81,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v4 ; encoding: [0xff,0xff,0x85,0xc8,0x02,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x85,0xc8,0x02,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v2, v255 ; encoding: [0xff,0xff,0xa3,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xa3,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v2, v255 ; encoding: [0xff,0xff,0xab,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xab,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_max_i32 v6, v2, v255 ; encoding: [0xff,0xff,0xaf,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xaf,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x95,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x95,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_min_i32 v6, v2, v255 ; encoding: [0xff,0xff,0xb1,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xb1,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x97,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x97,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, v2 ; encoding: 
[0xff,0xff,0x91,0xc8,0x02,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x91,0xc8,0x02,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x8f,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x8f,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_mul_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x87,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x87,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_sub_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x8b,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x8b,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v2, v255 ; encoding: [0xff,0xff,0xa9,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xa9,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v255, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v2, v255 ; encoding: [0xff,0xff,0x8d,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x8d,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_add_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x89,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x89,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v4, v255 ; encoding: [0x03,0xff,0xa1,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xa1,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v4, v255 ; encoding: [0x03,0xff,0xad,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xad,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, 
v255 :: v_dual_cndmask_b32 v6, v4, v255 ; encoding: [0x03,0xff,0x93,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x93,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v4, v255, 0xaf123456 ; encoding: [0x03,0xff,0x83,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x83,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x81,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x81,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v4 ; encoding: [0x03,0xff,0x85,0xc8,0x04,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x85,0xc8,0x04,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v4, v255 ; encoding: [0x03,0xff,0xa3,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xa3,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v4, v255 ; encoding: [0x03,0xff,0xab,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xab,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_max_i32 v6, v4, v255 ; encoding: [0x03,0xff,0xaf,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xaf,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_max_num_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x95,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x95,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_min_i32 v6, v4, v255 ; encoding: [0x03,0xff,0xb1,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x03,0xff,0xb1,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x97,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x97,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x91,0xc8,0x04,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x91,0xc8,0x04,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x8f,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x8f,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_mul_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x87,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x87,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_sub_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x8b,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x8b,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v4, v255 ; encoding: [0x03,0xff,0xa9,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xa9,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v3, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v4, v255 ; encoding: [0x03,0xff,0x8d,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x8d,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_add_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x89,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x89,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, v1, v255 ; encoding: 
[0x04,0xff,0xa1,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xa1,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, v1, v255 ; encoding: [0x04,0xff,0xad,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xad,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_cndmask_b32 v6, v1, v255 ; encoding: [0x04,0xff,0x93,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x93,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, v1, v255, 0xaf123456 ; encoding: [0x04,0xff,0x83,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x83,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmac_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x81,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x81,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v4 ; encoding: [0x04,0xff,0x85,0xc8,0x01,0x09,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x85,0xc8,0x01,0x09,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, v1, v255 ; encoding: [0x04,0xff,0xa3,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xa3,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, v1, v255 ; encoding: [0x04,0xff,0xab,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xab,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_max_i32 v6, v1, v255 ; encoding: [0x04,0xff,0xaf,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xaf,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 
0xaf123456, v255 :: v_dual_max_num_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x95,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x95,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_min_i32 v6, v1, v255 ; encoding: [0x04,0xff,0xb1,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xb1,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_min_num_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x97,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x97,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x91,0xc8,0x01,0x01,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x91,0xc8,0x01,0x01,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x8f,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x8f,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_mul_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x87,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x87,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_sub_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x8b,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x8b,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, v1, v255 ; encoding: [0x04,0xff,0xa9,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xa9,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, v4, 0xaf123456, v255 :: v_dual_subrev_f32 v6, v1, v255 ; encoding: [0x04,0xff,0x8d,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x8d,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# 
GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_add_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x89,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x89,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xa1,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xa1,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xad,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xad,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, vcc_hi, v255, 0xaf123456 ; encoding: [0x6b,0xfe,0x83,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x83,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmac_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x81,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x81,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v4 ; encoding: [0x6b,0xfe,0x85,0xc8,0x6b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x85,0xc8,0x6b,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xa3,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xa3,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xab,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xab,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_max_i32 v6, vcc_hi, v255 ; 
encoding: [0x6b,0xfe,0xaf,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xaf,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_max_num_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x95,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x95,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_min_i32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xb1,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xb1,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_min_num_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x97,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x97,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x6b,0xfe,0x91,0xc8,0x6b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x91,0xc8,0x6b,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x8f,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x8f,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_mul_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x87,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x87,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_sub_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x8b,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x8b,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0xa9,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x6b,0xfe,0xa9,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_hi, 0xaf123456, v255 :: v_dual_subrev_f32 v6, vcc_hi, v255 ; encoding: [0x6b,0xfe,0x8d,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x8d,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_add_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x89,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x89,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_add_nc_u32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xa1,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xa1,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_ashrrev_i32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xad,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xad,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmaak_f32 v6, vcc_lo, v255, 0xaf123456 ; encoding: [0x6a,0xfe,0x83,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x83,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmac_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x81,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x81,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v4 ; encoding: [0x6a,0xfe,0x85,0xc8,0x6a,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x85,0xc8,0x6a,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_lshlrev_b32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xa3,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xa3,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, 
vcc_lo, 0xaf123456, v255 :: v_dual_lshrrev_b32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xab,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xab,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_max_i32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xaf,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xaf,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_max_num_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x95,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x95,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_min_i32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xb1,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xb1,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_min_num_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x97,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x97,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x6a,0xfe,0x91,0xc8,0x6a,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x91,0xc8,0x6a,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x8f,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x8f,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_mul_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x87,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x87,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_sub_f32 v6, vcc_lo, v255 ; encoding: 
[0x6a,0xfe,0x8b,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x8b,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_sub_nc_u32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0xa9,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xa9,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v255, vcc_lo, 0xaf123456, v255 :: v_dual_subrev_f32 v6, vcc_lo, v255 ; encoding: [0x6a,0xfe,0x8d,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x8d,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_add_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x88,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x88,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_add_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xa0,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xa0,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_ashrrev_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xac,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xac,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_cndmask_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x92,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x92,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmaak_f32 v255, 0xaf123456, v5, 0xaf123456 ; encoding: [0x7c,0x08,0x82,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x82,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmac_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x80,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x80,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf 
+ +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0x7c,0x08,0x84,0xc8,0xff,0xfe,0xff,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x84,0xc8,0xff,0xfe,0xff,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_lshlrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xa2,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xa2,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_lshrrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xaa,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xaa,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_max_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xae,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xae,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_max_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x94,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x94,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_min_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xb0,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xb0,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_min_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x96,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x96,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x08,0x90,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x90,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v5 ; encoding: 
[0x7c,0x08,0x8e,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x8e,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_mul_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x86,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x86,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_sub_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x8a,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x8a,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_sub_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0xa8,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0xa8,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_fmamk_f32 v6, null, 0xaf123456, v4 :: v_dual_subrev_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x08,0x8c,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x08,0x8c,0xc8,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x88,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x88,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa0,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xa0,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xac,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xac,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x92,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x92,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x82,0xca,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x82,0xca,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# 
GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x80,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x80,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x84,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x84,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa2,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xa2,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xaa,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xaa,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xae,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xae,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x94,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x94,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xb0,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xb0,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x96,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x96,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x90,0xca,0xfd,0x00,0x06,0xff] +0xc1,0x08,0x90,0xca,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8e,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x8e,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x86,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x86,0xca,0xfd,0x0a,0x06,0xff + 
+# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8a,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x8a,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa8,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xa8,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8c,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x8c,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x88,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x88,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa0,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xa0,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xac,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xac,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x92,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x92,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x82,0xca,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x82,0xca,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x80,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x80,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x84,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x84,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa2,0xca,0xf0,0x04,0x06,0xff] 
+0xf0,0x06,0xa2,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xaa,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xaa,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xae,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xae,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x94,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x94,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xb0,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xb0,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x96,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x96,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x90,0xca,0xf0,0x00,0x06,0xff] +0xf0,0x06,0x90,0xca,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8e,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x8e,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x86,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x86,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8a,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x8a,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa8,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xa8,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8c,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x8c,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 
0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x88,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x88,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xa0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xac,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xac,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x92,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x92,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x82,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x82,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x80,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x80,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xa2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xa2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xaa,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xaa,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xae,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xae,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x94,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x94,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xb0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xb0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x96,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x96,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x8e,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x86,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x86,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x8a,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xa8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x8c,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0xfe,0x85,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x91,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x91,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x88,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x88,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa0,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xa0,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xac,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xac,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x92,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x92,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x82,0xca,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x82,0xca,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x80,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x80,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa2,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xa2,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xaa,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xaa,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xae,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xae,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 
v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x94,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x94,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xb0,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xb0,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x96,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x96,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8e,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x8e,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x86,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x86,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8a,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x8a,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa8,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xa8,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8c,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x8c,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x85,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x85,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x91,0xca,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0x91,0xca,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x88,0xca,0x7b,0x06,0x06,0xff] 
+0x7e,0x04,0x88,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa0,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xa0,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xac,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xac,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x92,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x92,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x82,0xca,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x82,0xca,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x80,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x80,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa2,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xa2,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xaa,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xaa,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xae,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xae,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x94,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x94,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xb0,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xb0,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 
; encoding: [0x7e,0x04,0x96,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x96,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8e,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x8e,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x86,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x86,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8a,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x8a,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa8,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xa8,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8c,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x8c,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x85,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x85,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x91,0xca,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0x91,0xca,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x88,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x88,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa0,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xa0,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xac,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xac,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: 
v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x92,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x92,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x82,0xca,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x82,0xca,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x80,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x80,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa2,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xa2,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xaa,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xaa,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xae,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xae,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x94,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x94,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xb0,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xb0,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x96,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x96,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8e,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x8e,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x86,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x86,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: 
[0x7d,0x04,0x8a,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x8a,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa8,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xa8,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8c,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x8c,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x85,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x85,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x91,0xca,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0x91,0xca,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x88,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x88,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa0,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xa0,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xac,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xac,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x92,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0x92,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x82,0xca,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x82,0xca,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x80,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x80,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 
; encoding: [0x01,0x04,0xa2,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xa2,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xaa,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xaa,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xae,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xae,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x94,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x94,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xb0,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xb0,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x96,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x96,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8e,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x8e,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x86,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x86,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8a,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x8a,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa8,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xa8,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8c,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0x8c,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: 
[0x01,0xfe,0x85,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x85,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x91,0xca,0x69,0x00,0x06,0xff] +0x01,0xfe,0x91,0xca,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x88,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x88,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa0,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xa0,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xac,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xac,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x92,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0x92,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x82,0xca,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x82,0xca,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x80,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x80,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xa2,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xa2,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xaa,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xaa,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xae,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xae,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 
; encoding: [0x69,0x04,0x94,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x94,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xb0,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xb0,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x96,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x96,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8e,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x8e,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x86,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x86,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8a,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x8a,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa8,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xa8,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8c,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0x8c,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x85,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x85,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x91,0xca,0x01,0x00,0x06,0xff] +0x69,0xfe,0x91,0xca,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x88,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x88,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0xa0,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xa0,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xac,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xac,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x92,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x92,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x82,0xca,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x82,0xca,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x80,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x80,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa2,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xa2,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xaa,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xaa,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xae,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xae,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x94,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x94,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xb0,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xb0,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x96,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x96,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, 
v3 ; encoding: [0xfd,0x04,0x8e,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x8e,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x86,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x86,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8a,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x8a,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa8,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xa8,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8c,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x8c,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x85,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x85,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x91,0xca,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0x91,0xca,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x88,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x88,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa0,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xa0,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xac,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xac,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x92,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x92,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: 
v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x82,0xca,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x82,0xca,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x80,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x80,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa2,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xa2,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xaa,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xaa,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xae,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xae,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x94,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x94,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xb0,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xb0,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x96,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x96,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8e,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x8e,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x86,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x86,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8a,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x8a,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: 
v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa8,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xa8,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8c,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x8c,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x85,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x85,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x91,0xca,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0x91,0xca,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x88,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x88,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa0,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xa0,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xac,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xac,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x92,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x92,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x82,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x82,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x80,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x80,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xa2,0xca,0xff,0x07,0x06,0xff] 
+0x01,0x05,0xa2,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xaa,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xaa,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xae,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xae,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x94,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x94,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xb0,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xb0,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x96,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x96,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8e,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x8e,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x86,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x86,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8a,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x8a,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa8,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xa8,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8c,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0x8c,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x85,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0xff,0x85,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x91,0xca,0xff,0x01,0x06,0xff] +0x01,0xff,0x91,0xca,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x88,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x88,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa0,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xa0,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xac,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xac,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x92,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x92,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x82,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x82,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x80,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x80,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xa2,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xa2,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xaa,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xaa,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xae,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xae,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x94,0xca,0x03,0x07,0x06,0xff] 
+0x02,0x05,0x94,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xb0,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xb0,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x96,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x96,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8e,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x8e,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x86,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x86,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8a,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x8a,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa8,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xa8,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8c,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0x8c,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x85,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x85,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x91,0xca,0x03,0x01,0x06,0xff] +0x02,0xff,0x91,0xca,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x88,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x88,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa0,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xa0,0xca,0x02,0x07,0x06,0xff + +# 
GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xac,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xac,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x92,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x92,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x82,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x82,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x80,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x80,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xa2,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xa2,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xaa,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xaa,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xae,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xae,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x94,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x94,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xb0,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xb0,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x96,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x96,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8e,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x8e,0xca,0x02,0x07,0x06,0xff + +# GFX1250: 
v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x86,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x86,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8a,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x8a,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa8,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xa8,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8c,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0x8c,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x85,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x85,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x91,0xca,0x02,0x01,0x06,0xff] +0xff,0xff,0x91,0xca,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x88,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x88,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa0,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xa0,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xac,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xac,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x92,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x92,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x82,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x82,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# 
GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x80,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x80,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xa2,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xa2,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xaa,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xaa,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xae,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xae,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x94,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x94,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xb0,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xb0,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x96,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x96,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8e,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x8e,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x86,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x86,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8a,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x8a,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa8,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xa8,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: 
[0x03,0x05,0x8c,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0x8c,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x85,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x85,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x91,0xca,0x04,0x01,0x06,0xff] +0x03,0xff,0x91,0xca,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x88,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x88,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa0,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xa0,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xac,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xac,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x92,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x92,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x82,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x82,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x80,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x80,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xa2,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xa2,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xaa,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xaa,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: 
[0x04,0x05,0xae,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xae,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x94,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x94,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xb0,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xb0,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x96,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x96,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8e,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x8e,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x86,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x86,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8a,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x8a,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa8,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xa8,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8c,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0x8c,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x85,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x85,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x91,0xca,0x01,0x01,0x06,0xff] +0x04,0xff,0x91,0xca,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x88,0xca,0x7e,0x06,0x06,0xff] 
+0x6b,0x04,0x88,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa0,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xa0,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xac,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xac,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x92,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x92,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x82,0xca,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x82,0xca,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x80,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x80,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa2,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xa2,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xaa,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xaa,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xae,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xae,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x94,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x94,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xb0,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xb0,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; 
encoding: [0x6b,0x04,0x96,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x96,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8e,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x8e,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x86,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x86,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8a,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x8a,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa8,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xa8,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8c,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x8c,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x85,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x85,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x91,0xca,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0x91,0xca,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x88,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x88,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa0,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xa0,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xac,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xac,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, 
vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x92,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x92,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x82,0xca,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x82,0xca,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x80,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x80,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa2,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xa2,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xaa,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xaa,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xae,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xae,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x94,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x94,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xb0,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xb0,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x96,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x96,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8e,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x8e,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x86,0xca,0x7f,0x06,0x06,0xff] 
+0x6a,0x04,0x86,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8a,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x8a,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa8,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xa8,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8c,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x8c,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x85,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x85,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x91,0xca,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0x91,0xca,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x88,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x88,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xa0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xac,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xac,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x92,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x92,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: 
[0x7c,0x0a,0x82,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x82,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x80,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x80,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x84,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x84,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xa2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xaa,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xaa,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xae,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xae,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x94,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x94,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xb0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xb0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x96,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x96,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; 
encoding: [0x7c,0x0a,0x90,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x90,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x8e,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x86,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x86,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x8a,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xa8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_max_num_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x8c,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc8,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc8,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe0,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe0,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xec,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xec,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd2,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd2,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 
-1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0xc2,0xca,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0xc2,0xca,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc0,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc0,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0xc4,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0xc4,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe2,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe2,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xea,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xea,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xee,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xee,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd4,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd4,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xf0,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xf0,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd6,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd6,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0xd0,0xca,0xfd,0x00,0x06,0xff] +0xc1,0x08,0xd0,0xca,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xce,0xca,0xfd,0x0a,0x06,0xff] 
+0xc1,0x08,0xce,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc6,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc6,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xca,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xca,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe8,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe8,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xcc,0xca,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xcc,0xca,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc8,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc8,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe0,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe0,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xec,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xec,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd2,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd2,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0xc2,0xca,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0xc2,0xca,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc0,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc0,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: 
[0xf0,0x06,0xc4,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0xc4,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe2,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe2,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xea,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xea,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xee,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xee,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd4,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd4,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xf0,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xf0,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd6,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd6,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0xd0,0xca,0xf0,0x00,0x06,0xff] +0xf0,0x06,0xd0,0xca,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xce,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xce,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc6,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc6,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xca,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xca,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe8,0xca,0xf0,0x04,0x06,0xff] 
+0xf0,0x06,0xe8,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xcc,0xca,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xcc,0xca,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xec,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xec,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0xd2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0xc2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xe2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe2,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xea,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0x04,0xea,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xee,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xee,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd4,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd4,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xf0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xf0,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd6,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd6,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0xce,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xce,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc6,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc6,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0xca,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xca,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe8,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: 
[0xff,0x04,0xcc,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xcc,0xca,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xc5,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xc5,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0xd1,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xd1,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc8,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc8,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe0,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe0,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xec,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xec,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0xd2,0xca,0x7f,0x06,0x06,0xff] +0x7f,0x04,0xd2,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0xc2,0xca,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0xc2,0xca,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc0,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc0,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe2,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe2,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; 
encoding: [0x7f,0x04,0xea,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xea,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xee,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xee,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd4,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xd4,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xf0,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xf0,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd6,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xd6,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xce,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xce,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc6,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc6,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xca,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xca,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe8,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe8,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xcc,0xca,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xcc,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0xc5,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xc5,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 
v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0xd1,0xca,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0xd1,0xca,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc8,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc8,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe0,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe0,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xec,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xec,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0xd2,0xca,0x7e,0x06,0x06,0xff] +0x7e,0x04,0xd2,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0xc2,0xca,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0xc2,0xca,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc0,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc0,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe2,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe2,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xea,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xea,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xee,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xee,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd4,0xca,0x7b,0x06,0x06,0xff] 
+0x7e,0x04,0xd4,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xf0,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xf0,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd6,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xd6,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xce,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xce,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc6,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc6,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xca,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xca,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe8,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe8,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xcc,0xca,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xcc,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0xc5,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xc5,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0xd1,0xca,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0xd1,0xca,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc8,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc8,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: 
[0x7d,0x04,0xe0,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe0,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xec,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xec,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd2,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd2,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0xc2,0xca,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0xc2,0xca,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc0,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc0,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe2,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe2,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xea,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xea,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xee,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xee,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd4,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd4,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xf0,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xf0,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd6,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd6,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xce,0xca,0x7d,0x06,0x06,0xff] 
+0x7d,0x04,0xce,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc6,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc6,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xca,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xca,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe8,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe8,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xcc,0xca,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xcc,0xca,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0xc5,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xc5,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0xd1,0xca,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0xd1,0xca,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc8,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xc8,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe0,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xe0,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xec,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xec,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0xd2,0xca,0x01,0x06,0x06,0xff] +0x01,0x04,0xd2,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0xc2,0xca,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0x04,0xc2,0xca,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc0,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xc0,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xe2,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xe2,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xea,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xea,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xee,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xee,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd4,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xd4,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xf0,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xf0,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd6,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xd6,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xce,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xce,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc6,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xc6,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xca,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xca,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe8,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xe8,0xca,0x69,0x06,0x06,0xff + +# GFX1250: 
v_dual_min_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xcc,0xca,0x69,0x06,0x06,0xff] +0x01,0x04,0xcc,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0xc5,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xc5,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0xd1,0xca,0x69,0x00,0x06,0xff] +0x01,0xfe,0xd1,0xca,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc8,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xc8,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe0,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xe0,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xec,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xec,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0xd2,0xca,0x69,0x06,0x06,0xff] +0x69,0x04,0xd2,0xca,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0xc2,0xca,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0xc2,0xca,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc0,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xc0,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xe2,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xe2,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xea,0xca,0x01,0x06,0x06,0xff] 
+0x69,0x04,0xea,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xee,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xee,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd4,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xd4,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xf0,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xf0,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd6,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xd6,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xce,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xce,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc6,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xc6,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xca,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xca,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe8,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xe8,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xcc,0xca,0x01,0x06,0x06,0xff] +0x69,0x04,0xcc,0xca,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0xc5,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xc5,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0xd1,0xca,0x01,0x00,0x06,0xff] 
+0x69,0xfe,0xd1,0xca,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc8,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc8,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe0,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe0,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xec,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xec,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd2,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd2,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0xc2,0xca,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0xc2,0xca,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc0,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc0,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe2,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe2,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xea,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xea,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xee,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xee,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd4,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd4,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: 
[0xfd,0x04,0xf0,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xf0,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd6,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd6,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xce,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xce,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc6,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc6,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xca,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xca,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe8,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe8,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xcc,0xca,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xcc,0xca,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0xc5,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xc5,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0xd1,0xca,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0xd1,0xca,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc8,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc8,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe0,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe0,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, 
vcc_lo, v3 ; encoding: [0x7b,0x04,0xec,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xec,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0xd2,0xca,0x7b,0x06,0x06,0xff] +0x7b,0x04,0xd2,0xca,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0xc2,0xca,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0xc2,0xca,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc0,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc0,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe2,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe2,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xea,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xea,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xee,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xee,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd4,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xd4,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xf0,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xf0,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd6,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xd6,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xce,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xce,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: 
v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc6,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc6,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xca,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xca,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe8,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe8,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xcc,0xca,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xcc,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0xc5,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xc5,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0xd1,0xca,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0xd1,0xca,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc8,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xc8,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe0,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xe0,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xec,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xec,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xd2,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xd2,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0xc2,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0x05,0xc2,0xca,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc0,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xc0,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xe2,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xe2,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xea,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xea,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xee,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xee,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd4,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xd4,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xf0,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xf0,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd6,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xd6,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xce,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xce,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc6,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xc6,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xca,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xca,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe8,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xe8,0xca,0xff,0x07,0x06,0xff + +# GFX1250: 
v_dual_min_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xcc,0xca,0xff,0x07,0x06,0xff] +0x01,0x05,0xcc,0xca,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0xc5,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xc5,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0xd1,0xca,0xff,0x01,0x06,0xff] +0x01,0xff,0xd1,0xca,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc8,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xc8,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe0,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xe0,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xec,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xec,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xd2,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xd2,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0xc2,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0xc2,0xca,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc0,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xc0,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xe2,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xe2,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xea,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xea,0xca,0x03,0x07,0x06,0xff + +# 
GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xee,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xee,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd4,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xd4,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xf0,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xf0,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd6,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xd6,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xce,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xce,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc6,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xc6,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xca,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xca,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe8,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xe8,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xcc,0xca,0x03,0x07,0x06,0xff] +0x02,0x05,0xcc,0xca,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0xc5,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xc5,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0xd1,0xca,0x03,0x01,0x06,0xff] +0x02,0xff,0xd1,0xca,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: 
v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc8,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xc8,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe0,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xe0,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xec,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xec,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xd2,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xd2,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0xc2,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0xc2,0xca,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc0,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xc0,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xe2,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xe2,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xea,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xea,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xee,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xee,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd4,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xd4,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xf0,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xf0,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 
; encoding: [0xff,0x05,0xd6,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xd6,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xce,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xce,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc6,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xc6,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xca,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xca,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe8,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xe8,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xcc,0xca,0x02,0x07,0x06,0xff] +0xff,0x05,0xcc,0xca,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0xc5,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xc5,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0xd1,0xca,0x02,0x01,0x06,0xff] +0xff,0xff,0xd1,0xca,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc8,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xc8,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe0,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xe0,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xec,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xec,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: 
[0x03,0x05,0xd2,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xd2,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0xc2,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0xc2,0xca,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc0,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xc0,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xe2,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xe2,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xea,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xea,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xee,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xee,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd4,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xd4,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xf0,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xf0,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd6,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xd6,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xce,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xce,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc6,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xc6,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xca,0xca,0x04,0x07,0x06,0xff] 
+0x03,0x05,0xca,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe8,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xe8,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xcc,0xca,0x04,0x07,0x06,0xff] +0x03,0x05,0xcc,0xca,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0xc5,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xc5,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0xd1,0xca,0x04,0x01,0x06,0xff] +0x03,0xff,0xd1,0xca,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc8,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xc8,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe0,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xe0,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xec,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xec,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xd2,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xd2,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0xc2,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0xc2,0xca,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc0,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xc0,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: 
[0x04,0x05,0xe2,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xe2,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xea,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xea,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xee,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xee,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd4,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xd4,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xf0,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xf0,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd6,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xd6,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xce,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xce,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc6,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xc6,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xca,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xca,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe8,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xe8,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xcc,0xca,0x01,0x07,0x06,0xff] +0x04,0x05,0xcc,0xca,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0xc5,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x04,0xff,0xc5,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0xd1,0xca,0x01,0x01,0x06,0xff] +0x04,0xff,0xd1,0xca,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc8,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc8,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe0,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe0,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xec,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xec,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0xd2,0xca,0x6b,0x06,0x06,0xff] +0x6b,0x04,0xd2,0xca,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0xc2,0xca,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0xc2,0xca,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc0,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc0,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe2,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe2,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xea,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xea,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xee,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xee,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 
; encoding: [0x6b,0x04,0xd4,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xd4,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xf0,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xf0,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd6,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xd6,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xce,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xce,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc6,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc6,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xca,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xca,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe8,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe8,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xcc,0xca,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xcc,0xca,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0xc5,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xc5,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0xd1,0xca,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0xd1,0xca,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc8,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc8,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, 
vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe0,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe0,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xec,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xec,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0xd2,0xca,0x6a,0x06,0x06,0xff] +0x6a,0x04,0xd2,0xca,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0xc2,0xca,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0xc2,0xca,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc0,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc0,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe2,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe2,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xea,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xea,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xee,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xee,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd4,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xd4,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xf0,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xf0,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd6,0xca,0x7f,0x06,0x06,0xff] 
+0x6a,0x04,0xd6,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xce,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xce,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc6,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc6,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xca,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xca,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe8,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe8,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xcc,0xca,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xcc,0xca,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0xc5,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xc5,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0xd1,0xca,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0xd1,0xca,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xec,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] 
+0x7c,0x0a,0xec,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0xc2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc4,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc4,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe2,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xea,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xea,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xee,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xee,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd4,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd4,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0xf0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xf0,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd6,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd6,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0xd0,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd0,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xce,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xce,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc6,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc6,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xca,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xca,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe8,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_min_num_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xcc,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xcc,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_add_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x08,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x08,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_add_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x20,0xca,0xfd,0x08,0x06,0xff] 
+0xc1,0x00,0x20,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_ashrrev_i32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x2c,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x2c,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_cndmask_b32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x12,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x12,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_fmaak_f32 v6, src_scc, v4, 0xaf123456 ; encoding: [0xc1,0x00,0x02,0xca,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x00,0x02,0xca,0xfd,0x08,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_fmac_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x00,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x00,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x00,0x04,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x00,0x04,0xca,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_lshlrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x22,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x22,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_lshrrev_b32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x2a,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x2a,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_max_i32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x2e,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x2e,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_max_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x14,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x14,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_min_i32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x30,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x30,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_min_num_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x16,0xca,0xfd,0x08,0x06,0xff] 
+0xc1,0x00,0x16,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x00,0x10,0xca,0xfd,0x00,0x06,0xff] +0xc1,0x00,0x10,0xca,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x0e,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x0e,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mul_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x06,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x06,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_sub_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x0a,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x0a,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_sub_nc_u32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x28,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x28,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_subrev_f32 v6, src_scc, v4 ; encoding: [0xc1,0x00,0x0c,0xca,0xfd,0x08,0x06,0xff] +0xc1,0x00,0x0c,0xca,0xfd,0x08,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_add_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x08,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x08,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_add_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x20,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x20,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_ashrrev_i32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x2c,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x2c,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_cndmask_b32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x12,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x12,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_fmaak_f32 v6, 0.5, v3, 0xaf123456 ; encoding: [0xf0,0x00,0x02,0xca,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x00,0x02,0xca,0xf0,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_fmac_f32 
v6, 0.5, v3 ; encoding: [0xf0,0x00,0x00,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x00,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x00,0x04,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x00,0x04,0xca,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_lshlrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x22,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x22,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_lshrrev_b32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x2a,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x2a,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_max_i32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x2e,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x2e,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_max_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x14,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x14,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_min_i32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x30,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x30,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_min_num_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x16,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x16,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x00,0x10,0xca,0xf0,0x00,0x06,0xff] +0xf0,0x00,0x10,0xca,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x0e,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x0e,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x06,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x06,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x0a,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x0a,0xca,0xf0,0x06,0x06,0xff + +# 
GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_nc_u32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x28,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x28,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_subrev_f32 v6, 0.5, v3 ; encoding: [0xf0,0x00,0x0c,0xca,0xf0,0x06,0x06,0xff] +0xf0,0x00,0x0c,0xca,0xf0,0x06,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_add_f32 v6, null, v255 ; encoding: [0xff,0x00,0x08,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x08,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_add_nc_u32 v6, null, v255 ; encoding: [0xff,0x00,0x20,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x20,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_ashrrev_i32 v6, null, v255 ; encoding: [0xff,0x00,0x2c,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x2c,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_cndmask_b32 v6, null, v255 ; encoding: [0xff,0x00,0x12,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x12,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmaak_f32 v6, null, v255, 0xaf123456 ; encoding: [0xff,0x00,0x02,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x02,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmac_f32 v6, null, v255 ; encoding: [0xff,0x00,0x00,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x00,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0x00,0x04,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x04,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_lshlrev_b32 v6, null, v255 ; encoding: 
[0xff,0x00,0x22,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x22,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_lshrrev_b32 v6, null, v255 ; encoding: [0xff,0x00,0x2a,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x2a,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_max_i32 v6, null, v255 ; encoding: [0xff,0x00,0x2e,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x2e,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_max_num_f32 v6, null, v255 ; encoding: [0xff,0x00,0x14,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x14,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_min_i32 v6, null, v255 ; encoding: [0xff,0x00,0x30,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x30,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_min_num_f32 v6, null, v255 ; encoding: [0xff,0x00,0x16,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x16,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0x00,0x10,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x10,0xca,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mul_dx9_zero_f32 v6, null, v255 ; encoding: [0xff,0x00,0x0e,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x0e,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_mul_f32 v6, null, v255 ; encoding: [0xff,0x00,0x06,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x06,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_sub_f32 v6, null, v255 ; encoding: 
[0xff,0x00,0x0a,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x0a,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_sub_nc_u32 v6, null, v255 ; encoding: [0xff,0x00,0x28,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x28,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, 0xaf123456 :: v_dual_subrev_f32 v6, null, v255 ; encoding: [0xff,0x00,0x0c,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x00,0x0c,0xca,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_add_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x08,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x08,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_add_nc_u32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x20,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x20,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_ashrrev_i32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x2c,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x2c,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_cndmask_b32 v6, exec_hi, v255 ; encoding: [0x7f,0x00,0x12,0xca,0x7f,0xfe,0x07,0xff] +0x7f,0x00,0x12,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_fmaak_f32 v6, exec_hi, v255, 0xaf123456 ; encoding: [0x7f,0x00,0x02,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x00,0x02,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_fmac_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x00,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x00,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0x00,0x04,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x00,0x04,0xca,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_lshlrev_b32 v6, vcc_hi, v255 ; encoding: 
[0x7f,0x00,0x22,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x22,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_lshrrev_b32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x2a,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x2a,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_max_i32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x2e,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x2e,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_max_num_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x14,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x14,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_min_i32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x30,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x30,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_min_num_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x16,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x16,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0x00,0x10,0xca,0x6b,0x00,0x06,0xff] +0x7f,0x00,0x10,0xca,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x0e,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x0e,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x06,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x06,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x0a,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x0a,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_nc_u32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x28,0xca,0x6b,0xfe,0x07,0xff] +0x7f,0x00,0x28,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_subrev_f32 v6, vcc_hi, v255 ; encoding: [0x7f,0x00,0x0c,0xca,0x6b,0xfe,0x07,0xff] 
+0x7f,0x00,0x0c,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_add_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x08,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x08,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_add_nc_u32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x20,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x20,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_ashrrev_i32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x2c,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x2c,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_cndmask_b32 v6, exec_lo, v255 ; encoding: [0x7e,0x00,0x12,0xca,0x7e,0xfe,0x07,0xff] +0x7e,0x00,0x12,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_fmaak_f32 v6, exec_lo, v255, 0xaf123456 ; encoding: [0x7e,0x00,0x02,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x00,0x02,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_fmac_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x00,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x00,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0x00,0x04,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x00,0x04,0xca,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_lshlrev_b32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x22,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x22,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_lshrrev_b32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x2a,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x2a,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_max_i32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x2e,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x2e,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_max_num_f32 v6, ttmp15, v255 ; encoding: 
[0x7e,0x00,0x14,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x14,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_min_i32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x30,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x30,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_min_num_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x16,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x16,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0x00,0x10,0xca,0x7b,0x00,0x06,0xff] +0x7e,0x00,0x10,0xca,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x0e,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x0e,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x06,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x06,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x0a,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x0a,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_nc_u32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x28,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x28,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_subrev_f32 v6, ttmp15, v255 ; encoding: [0x7e,0x00,0x0c,0xca,0x7b,0xfe,0x07,0xff] +0x7e,0x00,0x0c,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_add_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x08,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x08,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_add_nc_u32 v6, m0, v255 ; encoding: [0x7d,0x00,0x20,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x20,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_ashrrev_i32 v6, m0, v255 ; encoding: [0x7d,0x00,0x2c,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x2c,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: 
v_dual_mov_b32 v255, m0 :: v_dual_cndmask_b32 v6, m0, v255 ; encoding: [0x7d,0x00,0x12,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x12,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_fmaak_f32 v6, m0, v255, 0xaf123456 ; encoding: [0x7d,0x00,0x02,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x00,0x02,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_fmac_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x00,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x00,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0x00,0x04,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x00,0x04,0xca,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_lshlrev_b32 v6, m0, v255 ; encoding: [0x7d,0x00,0x22,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x22,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_lshrrev_b32 v6, m0, v255 ; encoding: [0x7d,0x00,0x2a,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x2a,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_max_i32 v6, m0, v255 ; encoding: [0x7d,0x00,0x2e,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x2e,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_max_num_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x14,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x14,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_min_i32 v6, m0, v255 ; encoding: [0x7d,0x00,0x30,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x30,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_min_num_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x16,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x16,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0x00,0x10,0xca,0x7d,0x00,0x06,0xff] +0x7d,0x00,0x10,0xca,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mul_dx9_zero_f32 v6, m0, v255 ; 
encoding: [0x7d,0x00,0x0e,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x0e,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mul_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x06,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x06,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_sub_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x0a,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x0a,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_sub_nc_u32 v6, m0, v255 ; encoding: [0x7d,0x00,0x28,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x28,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_subrev_f32 v6, m0, v255 ; encoding: [0x7d,0x00,0x0c,0xca,0x7d,0xfe,0x07,0xff] +0x7d,0x00,0x0c,0xca,0x7d,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_add_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x08,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x08,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_add_nc_u32 v6, s105, v255 ; encoding: [0x01,0x00,0x20,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x20,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_ashrrev_i32 v6, s105, v255 ; encoding: [0x01,0x00,0x2c,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x2c,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_cndmask_b32 v6, s1, v255 ; encoding: [0x01,0x00,0x12,0xca,0x01,0xfe,0x07,0xff] +0x01,0x00,0x12,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_fmaak_f32 v6, s1, v255, 0xaf123456 ; encoding: [0x01,0x00,0x02,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0x00,0x02,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_fmac_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x00,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x00,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0x00,0x04,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0x00,0x04,0xca,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_lshlrev_b32 v6, s105, v255 ; encoding: [0x01,0x00,0x22,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x22,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_lshrrev_b32 v6, s105, v255 ; encoding: [0x01,0x00,0x2a,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x2a,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_max_i32 v6, s105, v255 ; encoding: [0x01,0x00,0x2e,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x2e,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_max_num_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x14,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x14,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_min_i32 v6, s105, v255 ; encoding: [0x01,0x00,0x30,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x30,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_min_num_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x16,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x16,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0x00,0x10,0xca,0x69,0x00,0x06,0xff] +0x01,0x00,0x10,0xca,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mul_dx9_zero_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x0e,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x0e,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mul_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x06,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x06,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_sub_f32 v6, s105, v255 ; encoding: [0x01,0x00,0x0a,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x0a,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_sub_nc_u32 v6, s105, v255 ; encoding: [0x01,0x00,0x28,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x28,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_subrev_f32 v6, s105, v255 ; encoding: 
[0x01,0x00,0x0c,0xca,0x69,0xfe,0x07,0xff] +0x01,0x00,0x0c,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_add_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x08,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x08,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_add_nc_u32 v6, s1, v255 ; encoding: [0x69,0x00,0x20,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x20,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_ashrrev_i32 v6, s1, v255 ; encoding: [0x69,0x00,0x2c,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x2c,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_cndmask_b32 v6, s105, v255 ; encoding: [0x69,0x00,0x12,0xca,0x69,0xfe,0x07,0xff] +0x69,0x00,0x12,0xca,0x69,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_fmaak_f32 v6, s105, v255, 0xaf123456 ; encoding: [0x69,0x00,0x02,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0x00,0x02,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_fmac_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x00,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x00,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0x00,0x04,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0x00,0x04,0xca,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_lshlrev_b32 v6, s1, v255 ; encoding: [0x69,0x00,0x22,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x22,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_lshrrev_b32 v6, s1, v255 ; encoding: [0x69,0x00,0x2a,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x2a,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_max_i32 v6, s1, v255 ; encoding: [0x69,0x00,0x2e,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x2e,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_max_num_f32 v6, s1, v255 ; encoding: 
[0x69,0x00,0x14,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x14,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_min_i32 v6, s1, v255 ; encoding: [0x69,0x00,0x30,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x30,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_min_num_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x16,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x16,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0x00,0x10,0xca,0x01,0x00,0x06,0xff] +0x69,0x00,0x10,0xca,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mul_dx9_zero_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x0e,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x0e,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mul_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x06,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x06,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_sub_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x0a,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x0a,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_sub_nc_u32 v6, s1, v255 ; encoding: [0x69,0x00,0x28,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x28,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_subrev_f32 v6, s1, v255 ; encoding: [0x69,0x00,0x0c,0xca,0x01,0xfe,0x07,0xff] +0x69,0x00,0x0c,0xca,0x01,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_add_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x08,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x08,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_add_nc_u32 v6, -1, v255 ; encoding: [0xfd,0x00,0x20,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x20,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_ashrrev_i32 v6, -1, v255 ; encoding: [0xfd,0x00,0x2c,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x2c,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: 
v_dual_cndmask_b32 v6, -1, v255 ; encoding: [0xfd,0x00,0x12,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x12,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_fmaak_f32 v6, -1, v255, 0xaf123456 ; encoding: [0xfd,0x00,0x02,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x00,0x02,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_fmac_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x00,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x00,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0x00,0x04,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x00,0x04,0xca,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_lshlrev_b32 v6, -1, v255 ; encoding: [0xfd,0x00,0x22,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x22,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_lshrrev_b32 v6, -1, v255 ; encoding: [0xfd,0x00,0x2a,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x2a,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_max_i32 v6, -1, v255 ; encoding: [0xfd,0x00,0x2e,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x2e,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_max_num_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x14,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x14,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_min_i32 v6, -1, v255 ; encoding: [0xfd,0x00,0x30,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x30,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_min_num_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x16,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x16,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0x00,0x10,0xca,0xc1,0x00,0x06,0xff] +0xfd,0x00,0x10,0xca,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: 
v_dual_mul_dx9_zero_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x0e,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x0e,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mul_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x06,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x06,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_sub_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x0a,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x0a,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_sub_nc_u32 v6, -1, v255 ; encoding: [0xfd,0x00,0x28,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x28,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_subrev_f32 v6, -1, v255 ; encoding: [0xfd,0x00,0x0c,0xca,0xc1,0xfe,0x07,0xff] +0xfd,0x00,0x0c,0xca,0xc1,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x08,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x08,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_nc_u32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x20,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x20,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_ashrrev_i32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x2c,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x2c,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_cndmask_b32 v6, ttmp15, v255 ; encoding: [0x7b,0x00,0x12,0xca,0x7b,0xfe,0x07,0xff] +0x7b,0x00,0x12,0xca,0x7b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmaak_f32 v6, ttmp15, v255, 0xaf123456 ; encoding: [0x7b,0x00,0x02,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x00,0x02,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmac_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x00,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x00,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, 
v255 ; encoding: [0x7b,0x00,0x04,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x00,0x04,0xca,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshlrev_b32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x22,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x22,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshrrev_b32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x2a,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x2a,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_i32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x2e,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x2e,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_num_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x14,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x14,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_i32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x30,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x30,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_num_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x16,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x16,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0x00,0x10,0xca,0x6a,0x00,0x06,0xff] +0x7b,0x00,0x10,0xca,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x0e,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x0e,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x06,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x06,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x0a,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x0a,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_nc_u32 v6, vcc_lo, v255 ; encoding: 
[0x7b,0x00,0x28,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x28,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_subrev_f32 v6, vcc_lo, v255 ; encoding: [0x7b,0x00,0x0c,0xca,0x6a,0xfe,0x07,0xff] +0x7b,0x00,0x0c,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_add_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x08,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x08,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_add_nc_u32 v6, v255, v255 ; encoding: [0x01,0x01,0x20,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x20,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_ashrrev_i32 v6, v255, v255 ; encoding: [0x01,0x01,0x2c,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x2c,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_cndmask_b32 v6, v255, v255 ; encoding: [0x01,0x01,0x12,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x12,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_fmaak_f32 v6, v255, v255, 0xaf123456 ; encoding: [0x01,0x01,0x02,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0x01,0x02,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_fmac_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x00,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x00,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0x01,0x04,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0x01,0x04,0xca,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_lshlrev_b32 v6, v255, v255 ; encoding: [0x01,0x01,0x22,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x22,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_lshrrev_b32 v6, v255, v255 ; encoding: [0x01,0x01,0x2a,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x2a,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_max_i32 v6, v255, v255 ; encoding: 
[0x01,0x01,0x2e,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x2e,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_max_num_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x14,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x14,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_min_i32 v6, v255, v255 ; encoding: [0x01,0x01,0x30,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x30,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_min_num_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x16,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x16,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0x01,0x10,0xca,0xff,0x01,0x06,0xff] +0x01,0x01,0x10,0xca,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mul_dx9_zero_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x0e,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x0e,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mul_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x06,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x06,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_sub_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x0a,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x0a,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_sub_nc_u32 v6, v255, v255 ; encoding: [0x01,0x01,0x28,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x28,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_subrev_f32 v6, v255, v255 ; encoding: [0x01,0x01,0x0c,0xca,0xff,0xff,0x07,0xff] +0x01,0x01,0x0c,0xca,0xff,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_add_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x08,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x08,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_add_nc_u32 v6, v3, v255 ; encoding: [0x02,0x01,0x20,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x20,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_ashrrev_i32 v6, v3, v255 ; 
encoding: [0x02,0x01,0x2c,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x2c,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_cndmask_b32 v6, v3, v255 ; encoding: [0x02,0x01,0x12,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x12,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_fmaak_f32 v6, v3, v255, 0xaf123456 ; encoding: [0x02,0x01,0x02,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0x01,0x02,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_fmac_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x00,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x00,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0x01,0x04,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0x01,0x04,0xca,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_lshlrev_b32 v6, v3, v255 ; encoding: [0x02,0x01,0x22,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x22,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_lshrrev_b32 v6, v3, v255 ; encoding: [0x02,0x01,0x2a,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x2a,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_max_i32 v6, v3, v255 ; encoding: [0x02,0x01,0x2e,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x2e,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_max_num_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x14,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x14,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_min_i32 v6, v3, v255 ; encoding: [0x02,0x01,0x30,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x30,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_min_num_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x16,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x16,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0x01,0x10,0xca,0x03,0x01,0x06,0xff] 
+0x02,0x01,0x10,0xca,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x0e,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x0e,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mul_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x06,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x06,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_sub_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x0a,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x0a,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_sub_nc_u32 v6, v3, v255 ; encoding: [0x02,0x01,0x28,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x28,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_subrev_f32 v6, v3, v255 ; encoding: [0x02,0x01,0x0c,0xca,0x03,0xff,0x07,0xff] +0x02,0x01,0x0c,0xca,0x03,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_add_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x08,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x08,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_add_nc_u32 v6, v2, v255 ; encoding: [0xff,0x01,0x20,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x20,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_ashrrev_i32 v6, v2, v255 ; encoding: [0xff,0x01,0x2c,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x2c,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_cndmask_b32 v6, v2, v255 ; encoding: [0xff,0x01,0x12,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x12,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_fmaak_f32 v6, v2, v255, 0xaf123456 ; encoding: [0xff,0x01,0x02,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x01,0x02,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_fmac_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x00,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x00,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_fmamk_f32 
v6, v2, 0xaf123456, v255 ; encoding: [0xff,0x01,0x04,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0x01,0x04,0xca,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_lshlrev_b32 v6, v2, v255 ; encoding: [0xff,0x01,0x22,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x22,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_lshrrev_b32 v6, v2, v255 ; encoding: [0xff,0x01,0x2a,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x2a,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_max_i32 v6, v2, v255 ; encoding: [0xff,0x01,0x2e,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x2e,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_max_num_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x14,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x14,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_min_i32 v6, v2, v255 ; encoding: [0xff,0x01,0x30,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x30,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_min_num_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x16,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x16,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0x01,0x10,0xca,0x02,0x01,0x06,0xff] +0xff,0x01,0x10,0xca,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mul_dx9_zero_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x0e,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x0e,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mul_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x06,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x06,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_sub_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x0a,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x0a,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_sub_nc_u32 v6, v2, v255 ; encoding: [0xff,0x01,0x28,0xca,0x02,0xff,0x07,0xff] 
+0xff,0x01,0x28,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_subrev_f32 v6, v2, v255 ; encoding: [0xff,0x01,0x0c,0xca,0x02,0xff,0x07,0xff] +0xff,0x01,0x0c,0xca,0x02,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_add_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x08,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x08,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_add_nc_u32 v6, v4, v255 ; encoding: [0x03,0x01,0x20,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x20,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_ashrrev_i32 v6, v4, v255 ; encoding: [0x03,0x01,0x2c,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x2c,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_cndmask_b32 v6, v4, v255 ; encoding: [0x03,0x01,0x12,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x12,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_fmaak_f32 v6, v4, v255, 0xaf123456 ; encoding: [0x03,0x01,0x02,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0x01,0x02,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_fmac_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x00,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x00,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0x01,0x04,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0x01,0x04,0xca,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_lshlrev_b32 v6, v4, v255 ; encoding: [0x03,0x01,0x22,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x22,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_lshrrev_b32 v6, v4, v255 ; encoding: [0x03,0x01,0x2a,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x2a,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_max_i32 v6, v4, v255 ; encoding: [0x03,0x01,0x2e,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x2e,0xca,0x04,0xff,0x07,0xff + +# GFX1250: 
v_dual_mov_b32 v255, v3 :: v_dual_max_num_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x14,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x14,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_min_i32 v6, v4, v255 ; encoding: [0x03,0x01,0x30,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x30,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_min_num_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x16,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x16,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0x01,0x10,0xca,0x04,0x01,0x06,0xff] +0x03,0x01,0x10,0xca,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mul_dx9_zero_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x0e,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x0e,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mul_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x06,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x06,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_sub_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x0a,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x0a,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_sub_nc_u32 v6, v4, v255 ; encoding: [0x03,0x01,0x28,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x28,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_subrev_f32 v6, v4, v255 ; encoding: [0x03,0x01,0x0c,0xca,0x04,0xff,0x07,0xff] +0x03,0x01,0x0c,0xca,0x04,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_add_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x08,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x08,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_add_nc_u32 v6, v1, v255 ; encoding: [0x04,0x01,0x20,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x20,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_ashrrev_i32 v6, v1, v255 ; encoding: [0x04,0x01,0x2c,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x2c,0xca,0x01,0xff,0x07,0xff + +# GFX1250: 
v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v6, v1, v255 ; encoding: [0x04,0x01,0x12,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x12,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fmaak_f32 v6, v1, v255, 0xaf123456 ; encoding: [0x04,0x01,0x02,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0x01,0x02,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fmac_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x00,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x00,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0x01,0x04,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0x01,0x04,0xca,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_lshlrev_b32 v6, v1, v255 ; encoding: [0x04,0x01,0x22,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x22,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_lshrrev_b32 v6, v1, v255 ; encoding: [0x04,0x01,0x2a,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x2a,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_max_i32 v6, v1, v255 ; encoding: [0x04,0x01,0x2e,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x2e,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_max_num_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x14,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x14,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_min_i32 v6, v1, v255 ; encoding: [0x04,0x01,0x30,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x30,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_min_num_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x16,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x16,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0x01,0x10,0xca,0x01,0x01,0x06,0xff] +0x04,0x01,0x10,0xca,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mul_dx9_zero_f32 v6, v1, v255 ; 
encoding: [0x04,0x01,0x0e,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x0e,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mul_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x06,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x06,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_sub_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x0a,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x0a,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_sub_nc_u32 v6, v1, v255 ; encoding: [0x04,0x01,0x28,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x28,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_subrev_f32 v6, v1, v255 ; encoding: [0x04,0x01,0x0c,0xca,0x01,0xff,0x07,0xff] +0x04,0x01,0x0c,0xca,0x01,0xff,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x08,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x08,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_nc_u32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x20,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x20,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_ashrrev_i32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x2c,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x2c,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_cndmask_b32 v6, vcc_hi, v255 ; encoding: [0x6b,0x00,0x12,0xca,0x6b,0xfe,0x07,0xff] +0x6b,0x00,0x12,0xca,0x6b,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmaak_f32 v6, vcc_hi, v255, 0xaf123456 ; encoding: [0x6b,0x00,0x02,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x00,0x02,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmac_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x00,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x00,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: 
[0x6b,0x00,0x04,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x00,0x04,0xca,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshlrev_b32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x22,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x22,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshrrev_b32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x2a,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x2a,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_i32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x2e,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x2e,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_num_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x14,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x14,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_i32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x30,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x30,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_num_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x16,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x16,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0x00,0x10,0xca,0x7e,0x00,0x06,0xff] +0x6b,0x00,0x10,0xca,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x0e,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x0e,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x06,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x06,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x0a,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x0a,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_nc_u32 v6, exec_lo, v255 ; encoding: 
[0x6b,0x00,0x28,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x28,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_subrev_f32 v6, exec_lo, v255 ; encoding: [0x6b,0x00,0x0c,0xca,0x7e,0xfe,0x07,0xff] +0x6b,0x00,0x0c,0xca,0x7e,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x08,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x08,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_nc_u32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x20,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x20,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_ashrrev_i32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x2c,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x2c,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_cndmask_b32 v6, vcc_lo, v255 ; encoding: [0x6a,0x00,0x12,0xca,0x6a,0xfe,0x07,0xff] +0x6a,0x00,0x12,0xca,0x6a,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmaak_f32 v6, vcc_lo, v255, 0xaf123456 ; encoding: [0x6a,0x00,0x02,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x00,0x02,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmac_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x00,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x00,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0x00,0x04,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x00,0x04,0xca,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshlrev_b32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x22,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x22,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshrrev_b32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x2a,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x2a,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: 
v_dual_max_i32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x2e,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x2e,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_num_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x14,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x14,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_i32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x30,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x30,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_num_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x16,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x16,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0x00,0x10,0xca,0x7f,0x00,0x06,0xff] +0x6a,0x00,0x10,0xca,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x0e,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x0e,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x06,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x06,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x0a,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x0a,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_nc_u32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x28,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x28,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_subrev_f32 v6, exec_hi, v255 ; encoding: [0x6a,0x00,0x0c,0xca,0x7f,0xfe,0x07,0xff] +0x6a,0x00,0x0c,0xca,0x7f,0xfe,0x07,0xff + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_add_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x08,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x08,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_add_nc_u32 v255, 0xaf123456, v5 ; 
encoding: [0x7c,0x00,0x20,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x20,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_ashrrev_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x2c,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x2c,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_cndmask_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x12,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x12,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_fmaak_f32 v255, 0xaf123456, v5, 0xaf123456 ; encoding: [0x7c,0x00,0x02,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x02,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_fmac_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x00,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x00,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x00,0x04,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x04,0xca,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_lshlrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x22,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x22,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_lshrrev_b32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x2a,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x2a,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_max_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x2e,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x2e,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_max_num_f32 v255, 0xaf123456, v5 ; encoding: 
[0x7c,0x00,0x14,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x14,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_min_i32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x30,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x30,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_min_num_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x16,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x16,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x00,0x10,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x10,0xca,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x0e,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x0e,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_mul_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x06,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x06,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_sub_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x0a,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x0a,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_sub_nc_u32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x28,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x28,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mov_b32 v6, null :: v_dual_subrev_f32 v255, 0xaf123456, v5 ; encoding: [0x7c,0x00,0x0c,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x00,0x0c,0xca,0xff,0x0a,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc8,0xc9,0xfd,0x0a,0x06,0xff] 
+0xc1,0x08,0xc8,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe0,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe0,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xec,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xec,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd2,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd2,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0xc2,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0xc2,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc0,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc0,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0xc4,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0xc4,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe2,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe2,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xea,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xea,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xee,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xee,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd4,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd4,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xf0,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xf0,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd6,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd6,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0xd0,0xc9,0xfd,0x00,0x06,0xff] +0xc1,0x08,0xd0,0xc9,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xce,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xce,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc6,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc6,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xca,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xca,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe8,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe8,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xcc,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xcc,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc8,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc8,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe0,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe0,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xec,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xec,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd2,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd2,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0xc2,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0xc2,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc0,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc0,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0xc4,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0xc4,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe2,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe2,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xea,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xea,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xee,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xee,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd4,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd4,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xf0,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xf0,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd6,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd6,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: 
[0xf0,0x06,0xd0,0xc9,0xf0,0x00,0x06,0xff] +0xf0,0x06,0xd0,0xc9,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xce,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xce,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc6,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc6,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xca,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xca,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe8,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe8,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xcc,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xcc,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xec,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xec,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0xd2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; 
encoding: [0xff,0x04,0xc2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xe2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xea,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xea,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xee,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xee,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd4,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd4,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xf0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xf0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd6,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd6,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0xce,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xce,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 
v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc6,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc6,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0xca,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xca,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0xcc,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xcc,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xc5,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xc5,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0xd1,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xd1,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc8,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc8,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe0,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe0,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xec,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xec,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: 
[0x7f,0x04,0xd2,0xc9,0x7f,0x06,0x06,0xff] +0x7f,0x04,0xd2,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0xc2,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0xc2,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc0,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc0,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe2,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe2,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xea,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xea,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xee,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xee,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd4,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xd4,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xf0,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xf0,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd6,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xd6,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xce,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xce,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc6,0xc9,0x6b,0x06,0x06,0xff] 
+0x7f,0x04,0xc6,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xca,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xca,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe8,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe8,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xcc,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xcc,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0xc5,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xc5,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0xd1,0xc9,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0xd1,0xc9,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc8,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc8,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe0,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe0,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xec,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xec,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0xd2,0xc9,0x7e,0x06,0x06,0xff] +0x7e,0x04,0xd2,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0xc2,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x7e,0x04,0xc2,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc0,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc0,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe2,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe2,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xea,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xea,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xee,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xee,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd4,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xd4,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xf0,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xf0,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd6,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xd6,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xce,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xce,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc6,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc6,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xca,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xca,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: 
v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe8,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe8,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xcc,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xcc,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0xc5,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xc5,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0xd1,0xc9,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0xd1,0xc9,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc8,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc8,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe0,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe0,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xec,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xec,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd2,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd2,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0xc2,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0xc2,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc0,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc0,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe2,0xc9,0x7d,0x06,0x06,0xff] 
+0x7d,0x04,0xe2,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xea,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xea,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xee,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xee,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd4,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd4,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xf0,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xf0,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd6,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd6,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xce,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xce,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc6,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc6,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xca,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xca,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe8,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe8,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xcc,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xcc,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0xc5,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x7d,0xfe,0xc5,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0xd1,0xc9,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0xd1,0xc9,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc8,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xc8,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe0,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xe0,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xec,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xec,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0xd2,0xc9,0x01,0x06,0x06,0xff] +0x01,0x04,0xd2,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0xc2,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0xc2,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc0,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xc0,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xe2,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xe2,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xea,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xea,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xee,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xee,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xd4,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xd4,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xf0,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xf0,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd6,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xd6,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xce,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xce,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc6,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xc6,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xca,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xca,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe8,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xe8,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xcc,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xcc,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0xc5,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xc5,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0xd1,0xc9,0x69,0x00,0x06,0xff] +0x01,0xfe,0xd1,0xc9,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc8,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xc8,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_nc_u32 
v6, s1, v3 ; encoding: [0x69,0x04,0xe0,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xe0,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xec,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xec,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0xd2,0xc9,0x69,0x06,0x06,0xff] +0x69,0x04,0xd2,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0xc2,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0xc2,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc0,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xc0,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xe2,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xe2,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xea,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xea,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xee,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xee,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd4,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xd4,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xf0,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xf0,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd6,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xd6,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 
v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xce,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xce,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc6,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xc6,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xca,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xca,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe8,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xe8,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xcc,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xcc,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0xc5,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xc5,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0xd1,0xc9,0x01,0x00,0x06,0xff] +0x69,0xfe,0xd1,0xc9,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc8,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc8,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe0,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe0,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xec,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xec,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd2,0xc9,0xc1,0x06,0x06,0xff] 
+0xfd,0x04,0xd2,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0xc2,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0xc2,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc0,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc0,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe2,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe2,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xea,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xea,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xee,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xee,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd4,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd4,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xf0,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xf0,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd6,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd6,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xce,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xce,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc6,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc6,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, 
-1, v3 ; encoding: [0xfd,0x04,0xca,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xca,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe8,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe8,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xcc,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xcc,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0xc5,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xc5,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0xd1,0xc9,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0xd1,0xc9,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc8,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc8,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe0,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe0,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xec,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xec,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0xd2,0xc9,0x7b,0x06,0x06,0xff] +0x7b,0x04,0xd2,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0xc2,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0xc2,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: 
[0x7b,0x04,0xc0,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc0,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe2,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe2,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xea,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xea,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xee,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xee,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd4,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xd4,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xf0,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xf0,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd6,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xd6,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xce,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xce,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc6,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc6,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xca,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xca,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe8,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe8,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: 
v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xcc,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xcc,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0xc5,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xc5,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0xd1,0xc9,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0xd1,0xc9,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc8,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xc8,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe0,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xe0,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xec,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xec,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xd2,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xd2,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0xc2,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0xc2,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc0,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xc0,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xe2,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xe2,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xea,0xc9,0xff,0x07,0x06,0xff] 
+0x01,0x05,0xea,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xee,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xee,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd4,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xd4,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xf0,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xf0,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd6,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xd6,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xce,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xce,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc6,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xc6,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xca,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xca,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe8,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xe8,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xcc,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xcc,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0xc5,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0xc5,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: 
[0x01,0xff,0xd1,0xc9,0xff,0x01,0x06,0xff] +0x01,0xff,0xd1,0xc9,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc8,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xc8,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe0,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xe0,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xec,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xec,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xd2,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xd2,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0xc2,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0xc2,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc0,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xc0,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xe2,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xe2,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xea,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xea,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xee,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xee,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd4,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xd4,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; 
encoding: [0x02,0x05,0xf0,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xf0,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd6,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xd6,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xce,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xce,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc6,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xc6,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xca,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xca,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe8,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xe8,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xcc,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xcc,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0xc5,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xc5,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0xd1,0xc9,0x03,0x01,0x06,0xff] +0x02,0xff,0xd1,0xc9,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc8,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xc8,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe0,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xe0,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_ashrrev_i32 
v6, v2, v3 ; encoding: [0xff,0x05,0xec,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xec,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xd2,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xd2,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0xc2,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0xc2,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc0,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xc0,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xe2,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xe2,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xea,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xea,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xee,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xee,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd4,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xd4,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xf0,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xf0,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd6,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xd6,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xce,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xce,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 
v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc6,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xc6,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xca,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xca,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe8,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xe8,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xcc,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xcc,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0xc5,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xc5,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0xd1,0xc9,0x02,0x01,0x06,0xff] +0xff,0xff,0xd1,0xc9,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc8,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xc8,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe0,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xe0,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xec,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xec,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xd2,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xd2,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0xc2,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x03,0x05,0xc2,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc0,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xc0,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xe2,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xe2,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xea,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xea,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xee,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xee,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd4,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xd4,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xf0,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xf0,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd6,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xd6,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xce,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xce,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc6,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xc6,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xca,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xca,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe8,0xc9,0x04,0x07,0x06,0xff] 
+0x03,0x05,0xe8,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xcc,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xcc,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0xc5,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xc5,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0xd1,0xc9,0x04,0x01,0x06,0xff] +0x03,0xff,0xd1,0xc9,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc8,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xc8,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe0,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xe0,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xec,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xec,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xd2,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xd2,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0xc2,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0xc2,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc0,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xc0,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xe2,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xe2,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, 
v1, v3 ; encoding: [0x04,0x05,0xea,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xea,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xee,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xee,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd4,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xd4,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xf0,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xf0,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd6,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xd6,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xce,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xce,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc6,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xc6,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xca,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xca,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe8,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xe8,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xcc,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xcc,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0xc5,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xc5,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v255 :: 
v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0xd1,0xc9,0x01,0x01,0x06,0xff] +0x04,0xff,0xd1,0xc9,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc8,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc8,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe0,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe0,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xec,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xec,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0xd2,0xc9,0x6b,0x06,0x06,0xff] +0x6b,0x04,0xd2,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0xc2,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0xc2,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc0,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc0,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe2,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe2,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xea,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xea,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xee,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xee,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd4,0xc9,0x7e,0x06,0x06,0xff] 
+0x6b,0x04,0xd4,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xf0,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xf0,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd6,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xd6,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xce,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xce,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc6,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc6,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xca,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xca,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe8,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe8,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xcc,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xcc,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0xc5,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xc5,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0xd1,0xc9,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0xd1,0xc9,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc8,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc8,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 
vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe0,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe0,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xec,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xec,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0xd2,0xc9,0x6a,0x06,0x06,0xff] +0x6a,0x04,0xd2,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0xc2,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0xc2,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc0,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc0,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe2,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe2,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xea,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xea,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xee,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xee,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd4,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xd4,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xf0,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xf0,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: 
[0x6a,0x04,0xd6,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xd6,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xce,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xce,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc6,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc6,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xca,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xca,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe8,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe8,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xcc,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xcc,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0xc5,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xc5,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0xd1,0xc9,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0xd1,0xc9,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0xec,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xec,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0xc2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc4,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc4,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xea,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xea,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xee,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xee,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd4,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd4,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xf0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xf0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd6,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd6,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0xd0,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd0,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xce,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xce,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc6,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc6,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xca,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xca,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_dx9_zero_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xcc,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xcc,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc8,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc8,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: 
v_dual_mul_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe0,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe0,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xec,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xec,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd2,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd2,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0xc2,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0xc2,0xc8,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc0,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc0,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0xc4,0xc8,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0xc4,0xc8,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe2,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe2,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xea,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xea,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xee,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xee,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd4,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd4,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xf0,0xc8,0xfd,0x0a,0x06,0xff] 
+0xc1,0x08,0xf0,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xd6,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xd6,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0xd0,0xc8,0xfd,0x00,0x06,0xff] +0xc1,0x08,0xd0,0xc8,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xce,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xce,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xc6,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xc6,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xca,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xca,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xe8,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xe8,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xcc,0xc8,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xcc,0xc8,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc8,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc8,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe0,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe0,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xec,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xec,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd2,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd2,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 
v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0xc2,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0xc2,0xc8,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xc0,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc0,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0xc4,0xc8,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0xc4,0xc8,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe2,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe2,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xea,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xea,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xee,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xee,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd4,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd4,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xf0,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xf0,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xd6,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xd6,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0xd0,0xc8,0xf0,0x00,0x06,0xff] +0xf0,0x06,0xd0,0xc8,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xce,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xce,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; 
encoding: [0xf0,0x06,0xc6,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xc6,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xca,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xca,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xe8,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xe8,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xcc,0xc8,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xcc,0xc8,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc8,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc8,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xe0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xec,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xec,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0xd2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0xc2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 
v6, null, v3 ; encoding: [0xff,0x04,0xe2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe2,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xea,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xea,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xee,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xee,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd4,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd4,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xf0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xf0,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0xd6,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xd6,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0xce,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xce,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0xc6,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xc6,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0xca,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xca,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: 
[0xff,0x04,0xe8,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xe8,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0xcc,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xcc,0xc8,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xc5,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xc5,0xc8,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0xd1,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0xd1,0xc8,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc8,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc8,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe0,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe0,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xec,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xec,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0xd2,0xc8,0x7f,0x06,0x06,0xff] +0x7f,0x04,0xd2,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0xc2,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0xc2,0xc8,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc0,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc0,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; 
encoding: [0x7f,0x04,0xe2,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe2,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xea,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xea,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xee,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xee,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd4,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xd4,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xf0,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xf0,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xd6,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xd6,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xce,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xce,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xc6,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xc6,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xca,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xca,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xe8,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xe8,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xcc,0xc8,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xcc,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: 
[0x7f,0xfe,0xc5,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0xc5,0xc8,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0xd1,0xc8,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0xd1,0xc8,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc8,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc8,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe0,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe0,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xec,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xec,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0xd2,0xc8,0x7e,0x06,0x06,0xff] +0x7e,0x04,0xd2,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0xc2,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0xc2,0xc8,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc0,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc0,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe2,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe2,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xea,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xea,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xee,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xee,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: 
v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd4,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xd4,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xf0,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xf0,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xd6,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xd6,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xce,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xce,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xc6,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xc6,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xca,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xca,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xe8,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xe8,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xcc,0xc8,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xcc,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0xc5,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0xc5,0xc8,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0xd1,0xc8,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0xd1,0xc8,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc8,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc8,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: 
v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe0,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe0,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xec,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xec,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd2,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd2,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0xc2,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0xc2,0xc8,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc0,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc0,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe2,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe2,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xea,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xea,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xee,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xee,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd4,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd4,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xf0,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xf0,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xd6,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xd6,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xce,0xc8,0x7d,0x06,0x06,0xff] 
+0x7d,0x04,0xce,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xc6,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xc6,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xca,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xca,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xe8,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xe8,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0xcc,0xc8,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xcc,0xc8,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0xc5,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0xc5,0xc8,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0xd1,0xc8,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0xd1,0xc8,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc8,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xc8,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe0,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xe0,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xec,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xec,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0xd2,0xc8,0x01,0x06,0x06,0xff] +0x01,0x04,0xd2,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0xc2,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0xc2,0xc8,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf 
+ +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc0,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xc0,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xe2,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xe2,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xea,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xea,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xee,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xee,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd4,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xd4,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xf0,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xf0,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xd6,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xd6,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xce,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xce,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xc6,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xc6,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0xca,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xca,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xe8,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xe8,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: 
[0x01,0x04,0xcc,0xc8,0x69,0x06,0x06,0xff] +0x01,0x04,0xcc,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0xc5,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0xc5,0xc8,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0xd1,0xc8,0x69,0x00,0x06,0xff] +0x01,0xfe,0xd1,0xc8,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc8,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xc8,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe0,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xe0,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xec,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xec,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0xd2,0xc8,0x69,0x06,0x06,0xff] +0x69,0x04,0xd2,0xc8,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0xc2,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0xc2,0xc8,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc0,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xc0,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xe2,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xe2,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xea,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xea,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: 
[0x69,0x04,0xee,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xee,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd4,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xd4,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xf0,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xf0,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xd6,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xd6,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xce,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xce,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xc6,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xc6,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xca,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xca,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xe8,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xe8,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0xcc,0xc8,0x01,0x06,0x06,0xff] +0x69,0x04,0xcc,0xc8,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0xc5,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0xc5,0xc8,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0xd1,0xc8,0x01,0x00,0x06,0xff] +0x69,0xfe,0xd1,0xc8,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc8,0xc8,0xc1,0x06,0x06,0xff] 
+0xfd,0x04,0xc8,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe0,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe0,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xec,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xec,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd2,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd2,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0xc2,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0xc2,0xc8,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc0,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc0,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe2,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe2,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xea,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xea,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xee,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xee,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd4,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xd4,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xf0,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xf0,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xd6,0xc8,0xc1,0x06,0x06,0xff] 
+0xfd,0x04,0xd6,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xce,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xce,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xc6,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xc6,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xca,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xca,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xe8,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xe8,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0xcc,0xc8,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xcc,0xc8,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0xc5,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0xc5,0xc8,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0xd1,0xc8,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0xd1,0xc8,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc8,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc8,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe0,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe0,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xec,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xec,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0xd2,0xc8,0x7b,0x06,0x06,0xff] 
+0x7b,0x04,0xd2,0xc8,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0xc2,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0xc2,0xc8,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc0,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc0,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe2,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe2,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xea,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xea,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xee,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xee,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd4,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xd4,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xf0,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xf0,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xd6,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xd6,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xce,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xce,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xc6,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xc6,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xca,0xc8,0x6a,0x06,0x06,0xff] 
+0x7b,0x04,0xca,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xe8,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xe8,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xcc,0xc8,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xcc,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0xc5,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0xc5,0xc8,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0xd1,0xc8,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0xd1,0xc8,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc8,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xc8,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe0,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xe0,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xec,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xec,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xd2,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xd2,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0xc2,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0xc2,0xc8,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc0,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xc0,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: 
[0x01,0x05,0xe2,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xe2,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xea,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xea,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xee,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xee,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd4,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xd4,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xf0,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xf0,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xd6,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xd6,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xce,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xce,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xc6,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xc6,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xca,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xca,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xe8,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xe8,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0xcc,0xc8,0xff,0x07,0x06,0xff] +0x01,0x05,0xcc,0xc8,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0xc5,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x01,0xff,0xc5,0xc8,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0xd1,0xc8,0xff,0x01,0x06,0xff] +0x01,0xff,0xd1,0xc8,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc8,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xc8,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe0,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xe0,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xec,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xec,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xd2,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xd2,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0xc2,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0xc2,0xc8,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc0,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xc0,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xe2,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xe2,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xea,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xea,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xee,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xee,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd4,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xd4,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 
v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xf0,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xf0,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xd6,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xd6,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xce,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xce,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xc6,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xc6,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xca,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xca,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xe8,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xe8,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0xcc,0xc8,0x03,0x07,0x06,0xff] +0x02,0x05,0xcc,0xc8,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0xc5,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0xc5,0xc8,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0xd1,0xc8,0x03,0x01,0x06,0xff] +0x02,0xff,0xd1,0xc8,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc8,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xc8,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe0,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xe0,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xec,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xec,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xd2,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xd2,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0xc2,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0xc2,0xc8,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc0,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xc0,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xe2,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xe2,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xea,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xea,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xee,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xee,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd4,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xd4,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xf0,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xf0,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xd6,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xd6,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xce,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xce,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xc6,0xc8,0x02,0x07,0x06,0xff] 
+0xff,0x05,0xc6,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xca,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xca,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xe8,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xe8,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0xcc,0xc8,0x02,0x07,0x06,0xff] +0xff,0x05,0xcc,0xc8,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0xc5,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0xc5,0xc8,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0xd1,0xc8,0x02,0x01,0x06,0xff] +0xff,0xff,0xd1,0xc8,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc8,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xc8,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe0,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xe0,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xec,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xec,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xd2,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xd2,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0xc2,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0xc2,0xc8,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc0,0xc8,0x04,0x07,0x06,0xff] 
+0x03,0x05,0xc0,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xe2,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xe2,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xea,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xea,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xee,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xee,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd4,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xd4,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xf0,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xf0,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xd6,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xd6,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xce,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xce,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xc6,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xc6,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xca,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xca,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xe8,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xe8,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0xcc,0xc8,0x04,0x07,0x06,0xff] +0x03,0x05,0xcc,0xc8,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: 
[0x03,0xff,0xc5,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0xc5,0xc8,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0xd1,0xc8,0x04,0x01,0x06,0xff] +0x03,0xff,0xd1,0xc8,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc8,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xc8,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe0,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xe0,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xec,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xec,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xd2,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xd2,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0xc2,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0xc2,0xc8,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc0,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xc0,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xe2,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xe2,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xea,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xea,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xee,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xee,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd4,0xc8,0x01,0x07,0x06,0xff] 
+0x04,0x05,0xd4,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xf0,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xf0,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xd6,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xd6,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xce,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xce,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xc6,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xc6,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xca,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xca,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xe8,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xe8,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0xcc,0xc8,0x01,0x07,0x06,0xff] +0x04,0x05,0xcc,0xc8,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0xc5,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0xc5,0xc8,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0xd1,0xc8,0x01,0x01,0x06,0xff] +0x04,0xff,0xd1,0xc8,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc8,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc8,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe0,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe0,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, 
vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xec,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xec,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0xd2,0xc8,0x6b,0x06,0x06,0xff] +0x6b,0x04,0xd2,0xc8,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0xc2,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0xc2,0xc8,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc0,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc0,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe2,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe2,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xea,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xea,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xee,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xee,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd4,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xd4,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xf0,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xf0,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xd6,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xd6,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xce,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xce,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: 
v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xc6,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xc6,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xca,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xca,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xe8,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xe8,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xcc,0xc8,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xcc,0xc8,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0xc5,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0xc5,0xc8,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0xd1,0xc8,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0xd1,0xc8,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc8,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc8,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe0,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe0,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xec,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xec,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0xd2,0xc8,0x6a,0x06,0x06,0xff] +0x6a,0x04,0xd2,0xc8,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0xc2,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x6a,0x04,0xc2,0xc8,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc0,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc0,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe2,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xe2,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xea,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xea,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xee,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xee,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd4,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xd4,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xf0,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xf0,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xd6,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xd6,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xce,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xce,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xc6,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xc6,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xca,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xca,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xe8,0xc8,0x7f,0x06,0x06,0xff] 
+0x6a,0x04,0xe8,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xcc,0xc8,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xcc,0xc8,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0xc5,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0xc5,0xc8,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0xd1,0xc8,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0xd1,0xc8,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc8,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc8,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xec,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xec,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0xc2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: 
v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc4,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc4,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe2,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xea,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xea,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xee,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xee,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd4,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd4,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xf0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xf0,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xd6,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd6,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0xd0,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xd0,0xc8,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xce,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xce,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_mul_f32 v255, 
0xaf123456, v4 ; encoding: [0x7c,0x0a,0xc6,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xc6,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xca,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xca,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xe8,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xe8,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_mul_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xcc,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xcc,0xc8,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x48,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x48,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x60,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x60,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6c,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x6c,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x52,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x52,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x42,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x42,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x40,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x40,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: 
[0xc1,0x08,0x44,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x44,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x62,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x62,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6a,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x6a,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x6e,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x6e,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x54,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x54,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x70,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x70,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x56,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x56,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x50,0xc9,0xfd,0x00,0x06,0xff] +0xc1,0x08,0x50,0xc9,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4e,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x4e,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x46,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x46,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4a,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x4a,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x68,0xc9,0xfd,0x0a,0x06,0xff] 
+0xc1,0x08,0x68,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x4c,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x4c,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x48,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x48,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x60,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x60,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6c,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x6c,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x52,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x52,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x42,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x42,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x40,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x40,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x44,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x44,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x62,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x62,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6a,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x6a,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x6e,0xc9,0xf0,0x04,0x06,0xff] 
+0xf0,0x06,0x6e,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x54,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x54,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x70,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x70,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x56,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x56,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x50,0xc9,0xf0,0x00,0x06,0xff] +0xf0,0x06,0x50,0xc9,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4e,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x4e,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x46,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x46,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4a,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x4a,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x68,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x68,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x4c,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x4c,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x48,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x48,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x60,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0x04,0x60,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x52,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x52,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x42,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x42,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x40,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x40,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x62,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x62,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0x6a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0x6e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x6e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x54,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x54,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0x70,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0x04,0x70,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x56,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x56,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x46,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x46,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0x68,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x68,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x4c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x4c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x45,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x45,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x51,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x51,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x48,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x48,0xc9,0x6b,0x06,0x06,0xff 
+ +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x60,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x60,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x6c,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x6c,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x52,0xc9,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x52,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x42,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x42,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x40,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x40,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x62,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x62,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x6a,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x6a,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x6e,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x6e,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x54,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x54,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x70,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x70,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x56,0xc9,0x6b,0x06,0x06,0xff] 
+0x7f,0x04,0x56,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x4e,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x4e,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x46,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x46,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x4a,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x4a,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x68,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x68,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x4c,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x4c,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x45,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x45,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x51,0xc9,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0x51,0xc9,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x48,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x48,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x60,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x60,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x6c,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x6c,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: 
[0x7e,0x04,0x52,0xc9,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x52,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x42,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x42,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x40,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x40,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x62,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x62,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x6a,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x6a,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x6e,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x6e,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x54,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x54,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x70,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x70,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x56,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x56,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x4e,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x4e,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x46,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x46,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; 
encoding: [0x7e,0x04,0x4a,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x4a,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x68,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x68,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x4c,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x4c,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x45,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x45,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: [0x7e,0xfe,0x51,0xc9,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0x51,0xc9,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x48,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x48,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x60,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x60,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6c,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x6c,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x52,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x52,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x42,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x42,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x40,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x40,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; 
encoding: [0x7d,0x04,0x62,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x62,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6a,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x6a,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x6e,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x6e,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x54,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x54,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0x70,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x70,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x56,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x56,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4e,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x4e,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x46,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x46,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4a,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x4a,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0x68,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x68,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x4c,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x4c,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x45,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] 
+0x7d,0xfe,0x45,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x51,0xc9,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0x51,0xc9,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x48,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x48,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x60,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x60,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x6c,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x6c,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x52,0xc9,0x01,0x06,0x06,0xff] +0x01,0x04,0x52,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x42,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x42,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x40,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x40,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x62,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x62,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0x6a,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x6a,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x6e,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x6e,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x54,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x54,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: 
v_dual_sub_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0x70,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x70,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x56,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x56,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x4e,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x4e,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x46,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x46,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x4a,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x4a,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0x68,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x68,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x4c,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x4c,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x45,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x45,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x51,0xc9,0x69,0x00,0x06,0xff] +0x01,0xfe,0x51,0xc9,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x48,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x48,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x60,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x60,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: 
[0x69,0x04,0x6c,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x6c,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x52,0xc9,0x69,0x06,0x06,0xff] +0x69,0x04,0x52,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x42,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x42,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x40,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x40,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x62,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x62,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0x6a,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x6a,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x6e,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x6e,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x54,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x54,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0x70,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x70,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x56,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x56,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x4e,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x4e,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x46,0xc9,0x01,0x06,0x06,0xff] 
+0x69,0x04,0x46,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x4a,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x4a,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0x68,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x68,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x4c,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x4c,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; encoding: [0x69,0xfe,0x45,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x45,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x51,0xc9,0x01,0x00,0x06,0xff] +0x69,0xfe,0x51,0xc9,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x48,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x48,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x60,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x60,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6c,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x6c,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x52,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x52,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x42,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x42,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x40,0xc9,0xc1,0x06,0x06,0xff] 
+0xfd,0x04,0x40,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x62,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x62,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6a,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x6a,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x6e,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x6e,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x54,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x54,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0x70,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x70,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x56,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x56,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4e,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x4e,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x46,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x46,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4a,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x4a,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0x68,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x68,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x4c,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x4c,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, src_scc, 
v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x45,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x45,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x51,0xc9,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0x51,0xc9,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x48,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x48,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x60,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x60,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x6c,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x6c,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x52,0xc9,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x52,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x42,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x42,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x40,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x40,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x62,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x62,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x6a,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x6a,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x6e,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x6e,0xc9,0x6a,0x06,0x06,0xff + +# 
GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x54,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x54,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x70,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x70,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x56,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x56,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x4e,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x4e,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x46,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x46,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x4a,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x4a,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x68,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x68,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x4c,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x4c,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x45,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x45,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x51,0xc9,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0x51,0xc9,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x48,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x48,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: 
v_dual_sub_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x60,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x60,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6c,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x6c,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x52,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x52,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x42,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x42,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x40,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x40,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x62,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x62,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x6a,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x6a,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x6e,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x6e,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x54,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x54,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0x70,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x70,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x56,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x56,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; 
encoding: [0x01,0x05,0x4e,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x4e,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x46,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x46,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4a,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x4a,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0x68,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x68,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x4c,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x4c,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x45,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x45,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x51,0xc9,0xff,0x01,0x06,0xff] +0x01,0xff,0x51,0xc9,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x48,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x48,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x60,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x60,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6c,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x6c,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x52,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x52,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x42,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0x02,0x05,0x42,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x40,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x40,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x62,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x62,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x6a,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x6a,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x6e,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x6e,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x54,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x54,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0x70,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x70,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x56,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x56,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4e,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x4e,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x46,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x46,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x4a,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x4a,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0x68,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x68,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; 
encoding: [0x02,0x05,0x4c,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x4c,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x45,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x45,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: [0x02,0xff,0x51,0xc9,0x03,0x01,0x06,0xff] +0x02,0xff,0x51,0xc9,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x48,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x48,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x60,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x60,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x6c,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x6c,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x52,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x52,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x42,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x42,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x40,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x40,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x62,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x62,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x6a,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x6a,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: 
[0xff,0x05,0x6e,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x6e,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x54,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x54,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: [0xff,0x05,0x70,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x70,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x56,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x56,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4e,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x4e,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x46,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x46,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4a,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x4a,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0x68,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x68,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x4c,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x4c,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x45,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x45,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x51,0xc9,0x02,0x01,0x06,0xff] +0xff,0xff,0x51,0xc9,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x48,0xc9,0x04,0x07,0x06,0xff] 
+0x03,0x05,0x48,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x60,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x60,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6c,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x6c,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x52,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x52,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x42,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x42,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x40,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x40,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x62,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x62,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x6a,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x6a,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x6e,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x6e,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x54,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x54,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0x70,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x70,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x56,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x56,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: 
v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4e,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x4e,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x46,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x46,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4a,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x4a,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0x68,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x68,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x4c,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x4c,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x45,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x45,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x51,0xc9,0x04,0x01,0x06,0xff] +0x03,0xff,0x51,0xc9,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x48,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x48,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x60,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x60,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6c,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x6c,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x52,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x52,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: 
[0x04,0x05,0x42,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x42,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x40,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x40,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x62,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x62,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x6a,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x6a,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x6e,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x6e,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x54,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x54,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0x70,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x70,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x56,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x56,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4e,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x4e,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x46,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x46,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4a,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x4a,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0x68,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x68,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: 
v_dual_sub_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x4c,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x4c,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: [0x04,0xff,0x45,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x45,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x51,0xc9,0x01,0x01,0x06,0xff] +0x04,0xff,0x51,0xc9,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x48,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x48,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x60,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x60,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x6c,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x6c,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x52,0xc9,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x52,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x42,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x42,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x40,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x40,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x62,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x62,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x6a,0xc9,0x7e,0x06,0x06,0xff] 
+0x6b,0x04,0x6a,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x6e,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x6e,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x54,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x54,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x70,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x70,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x56,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x56,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x4e,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x4e,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x46,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x46,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x4a,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x4a,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x68,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x68,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x4c,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x4c,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x45,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x45,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: 
[0x6b,0xfe,0x51,0xc9,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0x51,0xc9,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x48,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x48,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x60,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x60,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x6c,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x6c,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x52,0xc9,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x52,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x42,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x42,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x40,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x40,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x62,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x62,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x6a,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x6a,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x6e,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x6e,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x54,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x54,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; 
encoding: [0x6a,0x04,0x70,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x70,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x56,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x56,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x4e,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x4e,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x46,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x46,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x4a,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x4a,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x68,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x68,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x4c,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x4c,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x45,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x45,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x51,0xc9,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0x51,0xc9,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x48,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x48,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x60,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] 
+0x7c,0x0a,0x60,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x52,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x52,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x42,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x42,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x40,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x40,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x44,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x44,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x62,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x62,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x6e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x6e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x54,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] 
+0x7c,0x0a,0x54,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_min_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x70,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x70,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x56,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x56,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x50,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x50,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x46,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x46,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x68,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x68,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_sub_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x4c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x4c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x88,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x88,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: 
v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa0,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xa0,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xac,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xac,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_cndmask_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x92,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x92,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmaak_f32 v6, src_scc, v5, 0xaf123456 ; encoding: [0xc1,0x08,0x82,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x82,0xc9,0xfd,0x0a,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmac_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x80,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x80,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmamk_f32 v6, src_scc, 0xaf123456, v255 ; encoding: [0xc1,0x08,0x84,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xc1,0x08,0x84,0xc9,0xfd,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa2,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xa2,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xaa,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xaa,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xae,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xae,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x94,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x94,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_i32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xb0,0xc9,0xfd,0x0a,0x06,0xff] 
+0xc1,0x08,0xb0,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_num_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x96,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x96,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mov_b32 v6, src_scc ; encoding: [0xc1,0x08,0x90,0xc9,0xfd,0x00,0x06,0xff] +0xc1,0x08,0x90,0xc9,0xfd,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8e,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x8e,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x86,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x86,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8a,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x8a,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0xa8,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0xa8,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_subrev_f32 v6, src_scc, v5 ; encoding: [0xc1,0x08,0x8c,0xc9,0xfd,0x0a,0x06,0xff] +0xc1,0x08,0x8c,0xc9,0xfd,0x0a,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x88,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x88,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa0,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xa0,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xac,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xac,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x92,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x92,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: 
v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmaak_f32 v6, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x06,0x82,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x82,0xc9,0xf0,0x04,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x80,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x80,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmamk_f32 v6, 0.5, 0xaf123456, v255 ; encoding: [0xf0,0x06,0x84,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xf0,0x06,0x84,0xc9,0xf0,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa2,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xa2,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xaa,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xaa,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xae,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xae,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x94,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x94,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_i32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xb0,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xb0,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x96,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x96,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mov_b32 v6, 0.5 ; encoding: [0xf0,0x06,0x90,0xc9,0xf0,0x00,0x06,0xff] +0xf0,0x06,0x90,0xc9,0xf0,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8e,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x8e,0xc9,0xf0,0x04,0x06,0xff 
+ +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x86,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x86,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8a,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x8a,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0xa8,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0xa8,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v6, 0.5, v2 ; encoding: [0xf0,0x06,0x8c,0xc9,0xf0,0x04,0x06,0xff] +0xf0,0x06,0x8c,0xc9,0xf0,0x04,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_add_f32 v6, null, v3 ; encoding: [0xff,0x04,0x88,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x88,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_add_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xa0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_ashrrev_i32 v6, null, v3 ; encoding: [0xff,0x04,0xac,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xac,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_cndmask_b32 v6, null, v3 ; encoding: [0xff,0x04,0x92,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x92,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_fmaak_f32 v6, null, v3, 0xaf123456 ; encoding: [0xff,0x04,0x82,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x82,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_fmac_f32 v6, null, v3 ; encoding: [0xff,0x04,0x80,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0x04,0x80,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_lshlrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xa2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xa2,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_lshrrev_b32 v6, null, v3 ; encoding: [0xff,0x04,0xaa,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xaa,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_max_i32 v6, null, v3 ; encoding: [0xff,0x04,0xae,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xae,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_max_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x94,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x94,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_min_i32 v6, null, v3 ; encoding: [0xff,0x04,0xb0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xb0,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_min_num_f32 v6, null, v3 ; encoding: [0xff,0x04,0x96,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x96,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_mul_dx9_zero_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x8e,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_mul_f32 v6, null, v3 ; encoding: [0xff,0x04,0x86,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x86,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_sub_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] 
+0xff,0x04,0x8a,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_sub_nc_u32 v6, null, v3 ; encoding: [0xff,0x04,0xa8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0xa8,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v2 :: v_dual_subrev_f32 v6, null, v3 ; encoding: [0xff,0x04,0x8c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x04,0x8c,0xc9,0x7c,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v255 :: v_dual_fmamk_f32 v6, null, 0xaf123456, v255 ; encoding: [0xff,0xfe,0x85,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x85,0xc9,0x7c,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, 0xaf123456, v255 :: v_dual_mov_b32 v6, null ; encoding: [0xff,0xfe,0x91,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0xfe,0x91,0xc9,0x7c,0x00,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x88,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x88,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa0,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xa0,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xac,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xac,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v6, exec_hi, v3 ; encoding: [0x7f,0x04,0x92,0xc9,0x7f,0x06,0x06,0xff] +0x7f,0x04,0x92,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmaak_f32 v6, exec_hi, v3, 0xaf123456 ; encoding: [0x7f,0x04,0x82,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7f,0x04,0x82,0xc9,0x7f,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v6, vcc_hi, v3 
; encoding: [0x7f,0x04,0x80,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x80,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa2,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xa2,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xaa,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xaa,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xae,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xae,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x94,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x94,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_i32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xb0,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xb0,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x96,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x96,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8e,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x8e,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x86,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x86,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0x8a,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x8a,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v6, vcc_hi, v3 ; encoding: [0x7f,0x04,0xa8,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0xa8,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v6, vcc_hi, v3 ; 
encoding: [0x7f,0x04,0x8c,0xc9,0x6b,0x06,0x06,0xff] +0x7f,0x04,0x8c,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_fmamk_f32 v6, exec_hi, 0xaf123456, v255 ; encoding: [0x7f,0xfe,0x85,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7f,0xfe,0x85,0xc9,0x7f,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v6, vcc_hi ; encoding: [0x7f,0xfe,0x91,0xc9,0x6b,0x00,0x06,0xff] +0x7f,0xfe,0x91,0xc9,0x6b,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x88,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x88,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa0,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xa0,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xac,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xac,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v6, exec_lo, v3 ; encoding: [0x7e,0x04,0x92,0xc9,0x7e,0x06,0x06,0xff] +0x7e,0x04,0x92,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmaak_f32 v6, exec_lo, v3, 0xaf123456 ; encoding: [0x7e,0x04,0x82,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7e,0x04,0x82,0xc9,0x7e,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x80,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x80,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa2,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xa2,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xaa,0xc9,0x7b,0x06,0x06,0xff] 
+0x7e,0x04,0xaa,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xae,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xae,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x94,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x94,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_i32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xb0,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xb0,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x96,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x96,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8e,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x8e,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x86,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x86,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8a,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x8a,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0xa8,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0xa8,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v6, ttmp15, v3 ; encoding: [0x7e,0x04,0x8c,0xc9,0x7b,0x06,0x06,0xff] +0x7e,0x04,0x8c,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_fmamk_f32 v6, exec_lo, 0xaf123456, v255 ; encoding: [0x7e,0xfe,0x85,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7e,0xfe,0x85,0xc9,0x7e,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v6, ttmp15 ; encoding: 
[0x7e,0xfe,0x91,0xc9,0x7b,0x00,0x06,0xff] +0x7e,0xfe,0x91,0xc9,0x7b,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x88,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x88,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa0,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xa0,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xac,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xac,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_cndmask_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0x92,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x92,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmaak_f32 v6, m0, v3, 0xaf123456 ; encoding: [0x7d,0x04,0x82,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7d,0x04,0x82,0xc9,0x7d,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmac_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x80,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x80,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa2,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xa2,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v6, m0, v3 ; encoding: [0x7d,0x04,0xaa,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xaa,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xae,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xae,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x94,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x94,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_i32 v6, m0, v3 ; encoding: [0x7d,0x04,0xb0,0xc9,0x7d,0x06,0x06,0xff] 
+0x7d,0x04,0xb0,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_num_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x96,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x96,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8e,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x8e,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x86,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x86,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8a,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x8a,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v6, m0, v3 ; encoding: [0x7d,0x04,0xa8,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0xa8,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_subrev_f32 v6, m0, v3 ; encoding: [0x7d,0x04,0x8c,0xc9,0x7d,0x06,0x06,0xff] +0x7d,0x04,0x8c,0xc9,0x7d,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, m0, v255 :: v_dual_fmamk_f32 v6, m0, 0xaf123456, v255 ; encoding: [0x7d,0xfe,0x85,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7d,0xfe,0x85,0xc9,0x7d,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, m0, v255 :: v_dual_mov_b32 v6, m0 ; encoding: [0x7d,0xfe,0x91,0xc9,0x7d,0x00,0x06,0xff] +0x7d,0xfe,0x91,0xc9,0x7d,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x88,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x88,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa0,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xa0,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xac,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xac,0xc9,0x69,0x06,0x06,0xff + +# 
GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_cndmask_b32 v6, s1, v3 ; encoding: [0x01,0x04,0x92,0xc9,0x01,0x06,0x06,0xff] +0x01,0x04,0x92,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmaak_f32 v6, s1, v3, 0xaf123456 ; encoding: [0x01,0x04,0x82,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x04,0x82,0xc9,0x01,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmac_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x80,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x80,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xa2,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xa2,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v6, s105, v3 ; encoding: [0x01,0x04,0xaa,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xaa,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xae,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xae,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x94,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x94,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_i32 v6, s105, v3 ; encoding: [0x01,0x04,0xb0,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xb0,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_num_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x96,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x96,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8e,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x8e,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x86,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x86,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, 
v2 :: v_dual_sub_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8a,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x8a,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v6, s105, v3 ; encoding: [0x01,0x04,0xa8,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0xa8,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_subrev_f32 v6, s105, v3 ; encoding: [0x01,0x04,0x8c,0xc9,0x69,0x06,0x06,0xff] +0x01,0x04,0x8c,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s1, v255 :: v_dual_fmamk_f32 v6, s1, 0xaf123456, v255 ; encoding: [0x01,0xfe,0x85,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xfe,0x85,0xc9,0x01,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, s1, v255 :: v_dual_mov_b32 v6, s105 ; encoding: [0x01,0xfe,0x91,0xc9,0x69,0x00,0x06,0xff] +0x01,0xfe,0x91,0xc9,0x69,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x88,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x88,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa0,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xa0,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xac,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xac,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_cndmask_b32 v6, s105, v3 ; encoding: [0x69,0x04,0x92,0xc9,0x69,0x06,0x06,0xff] +0x69,0x04,0x92,0xc9,0x69,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmaak_f32 v6, s105, v3, 0xaf123456 ; encoding: [0x69,0x04,0x82,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x69,0x04,0x82,0xc9,0x69,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmac_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x80,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x80,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: 
v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xa2,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xa2,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v6, s1, v3 ; encoding: [0x69,0x04,0xaa,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xaa,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xae,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xae,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x94,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x94,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_i32 v6, s1, v3 ; encoding: [0x69,0x04,0xb0,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xb0,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_num_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x96,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x96,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8e,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x8e,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x86,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x86,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8a,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x8a,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v6, s1, v3 ; encoding: [0x69,0x04,0xa8,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0xa8,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_subrev_f32 v6, s1, v3 ; encoding: [0x69,0x04,0x8c,0xc9,0x01,0x06,0x06,0xff] +0x69,0x04,0x8c,0xc9,0x01,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, s105, v255 :: v_dual_fmamk_f32 v6, s105, 0xaf123456, v255 ; 
encoding: [0x69,0xfe,0x85,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x69,0xfe,0x85,0xc9,0x69,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, s105, v255 :: v_dual_mov_b32 v6, s1 ; encoding: [0x69,0xfe,0x91,0xc9,0x01,0x00,0x06,0xff] +0x69,0xfe,0x91,0xc9,0x01,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x88,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x88,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa0,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xa0,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xac,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xac,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0x92,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x92,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmaak_f32 v6, -1, v3, 0xaf123456 ; encoding: [0xfd,0x04,0x82,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0xfd,0x04,0x82,0xc9,0xc1,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x80,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x80,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa2,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xa2,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v6, -1, v3 ; encoding: [0xfd,0x04,0xaa,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xaa,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xae,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xae,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: 
v_dual_max_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x94,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x94,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_i32 v6, -1, v3 ; encoding: [0xfd,0x04,0xb0,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xb0,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x96,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x96,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8e,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x8e,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x86,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x86,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8a,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x8a,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v6, -1, v3 ; encoding: [0xfd,0x04,0xa8,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0xa8,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v6, -1, v3 ; encoding: [0xfd,0x04,0x8c,0xc9,0xc1,0x06,0x06,0xff] +0xfd,0x04,0x8c,0xc9,0xc1,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_fmamk_f32 v6, -1, 0xaf123456, v255 ; encoding: [0xfd,0xfe,0x85,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0xfd,0xfe,0x85,0xc9,0xc1,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_mov_b32 v6, -1 ; encoding: [0xfd,0xfe,0x91,0xc9,0xc1,0x00,0x06,0xff] +0xfd,0xfe,0x91,0xc9,0xc1,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x88,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x88,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: 
v_dual_add_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa0,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xa0,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xac,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xac,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v6, ttmp15, v3 ; encoding: [0x7b,0x04,0x92,0xc9,0x7b,0x06,0x06,0xff] +0x7b,0x04,0x92,0xc9,0x7b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmaak_f32 v6, ttmp15, v3, 0xaf123456 ; encoding: [0x7b,0x04,0x82,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x7b,0x04,0x82,0xc9,0x7b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x80,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x80,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa2,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xa2,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xaa,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xaa,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xae,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xae,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x94,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x94,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_i32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xb0,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xb0,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x96,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x96,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: 
v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8e,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x8e,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x86,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x86,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8a,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x8a,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0xa8,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0xa8,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v6, vcc_lo, v3 ; encoding: [0x7b,0x04,0x8c,0xc9,0x6a,0x06,0x06,0xff] +0x7b,0x04,0x8c,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_fmamk_f32 v6, ttmp15, 0xaf123456, v255 ; encoding: [0x7b,0xfe,0x85,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x7b,0xfe,0x85,0xc9,0x7b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v6, vcc_lo ; encoding: [0x7b,0xfe,0x91,0xc9,0x6a,0x00,0x06,0xff] +0x7b,0xfe,0x91,0xc9,0x6a,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x88,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x88,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa0,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xa0,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xac,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xac,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_cndmask_b32 v6, v255, v3 ; encoding: [0x01,0x05,0x92,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x92,0xc9,0xff,0x07,0x06,0xff + +# 
GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmaak_f32 v6, v255, v3, 0xaf123456 ; encoding: [0x01,0x05,0x82,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x01,0x05,0x82,0xc9,0xff,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmac_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x80,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x80,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xa2,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xa2,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v6, v255, v3 ; encoding: [0x01,0x05,0xaa,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xaa,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xae,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xae,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x94,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x94,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_i32 v6, v255, v3 ; encoding: [0x01,0x05,0xb0,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xb0,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_num_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x96,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x96,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8e,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x8e,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x86,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x86,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8a,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x8a,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, 
v2 :: v_dual_sub_nc_u32 v6, v255, v3 ; encoding: [0x01,0x05,0xa8,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0xa8,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_subrev_f32 v6, v255, v3 ; encoding: [0x01,0x05,0x8c,0xc9,0xff,0x07,0x06,0xff] +0x01,0x05,0x8c,0xc9,0xff,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v1, v255 :: v_dual_fmamk_f32 v6, v255, 0xaf123456, v255 ; encoding: [0x01,0xff,0x85,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x01,0xff,0x85,0xc9,0xff,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v1, v255 :: v_dual_mov_b32 v6, v255 ; encoding: [0x01,0xff,0x91,0xc9,0xff,0x01,0x06,0xff] +0x01,0xff,0x91,0xc9,0xff,0x01,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x88,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x88,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa0,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xa0,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xac,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xac,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_cndmask_b32 v6, v3, v3 ; encoding: [0x02,0x05,0x92,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x92,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmaak_f32 v6, v3, v3, 0xaf123456 ; encoding: [0x02,0x05,0x82,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x02,0x05,0x82,0xc9,0x03,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmac_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x80,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x80,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xa2,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xa2,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, 
v2 :: v_dual_lshrrev_b32 v6, v3, v3 ; encoding: [0x02,0x05,0xaa,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xaa,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xae,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xae,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x94,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x94,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_i32 v6, v3, v3 ; encoding: [0x02,0x05,0xb0,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xb0,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_num_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x96,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x96,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8e,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x8e,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x86,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x86,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8a,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x8a,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v6, v3, v3 ; encoding: [0x02,0x05,0xa8,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0xa8,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_subrev_f32 v6, v3, v3 ; encoding: [0x02,0x05,0x8c,0xc9,0x03,0x07,0x06,0xff] +0x02,0x05,0x8c,0xc9,0x03,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v2, v255 :: v_dual_fmamk_f32 v6, v3, 0xaf123456, v255 ; encoding: [0x02,0xff,0x85,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x02,0xff,0x85,0xc9,0x03,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v2, v255 :: v_dual_mov_b32 v6, v3 ; encoding: 
[0x02,0xff,0x91,0xc9,0x03,0x01,0x06,0xff] +0x02,0xff,0x91,0xc9,0x03,0x01,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x88,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x88,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa0,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xa0,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xac,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xac,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_cndmask_b32 v6, v2, v3 ; encoding: [0xff,0x05,0x92,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x92,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmaak_f32 v6, v2, v3, 0xaf123456 ; encoding: [0xff,0x05,0x82,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0xff,0x05,0x82,0xc9,0x02,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmac_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x80,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x80,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xa2,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xa2,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v6, v2, v3 ; encoding: [0xff,0x05,0xaa,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xaa,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_i32 v6, v2, v3 ; encoding: [0xff,0x05,0xae,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xae,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x94,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x94,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_i32 v6, v2, v3 ; encoding: 
[0xff,0x05,0xb0,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xb0,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_num_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x96,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x96,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8e,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x8e,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x86,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x86,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8a,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x8a,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v6, v2, v3 ; encoding: [0xff,0x05,0xa8,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0xa8,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_subrev_f32 v6, v2, v3 ; encoding: [0xff,0x05,0x8c,0xc9,0x02,0x07,0x06,0xff] +0xff,0x05,0x8c,0xc9,0x02,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v255, v255 :: v_dual_fmamk_f32 v6, v2, 0xaf123456, v255 ; encoding: [0xff,0xff,0x85,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0xff,0xff,0x85,0xc9,0x02,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v255, v255 :: v_dual_mov_b32 v6, v2 ; encoding: [0xff,0xff,0x91,0xc9,0x02,0x01,0x06,0xff] +0xff,0xff,0x91,0xc9,0x02,0x01,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x88,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x88,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa0,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xa0,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xac,0xc9,0x04,0x07,0x06,0xff] 
+0x03,0x05,0xac,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_cndmask_b32 v6, v4, v3 ; encoding: [0x03,0x05,0x92,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x92,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmaak_f32 v6, v4, v3, 0xaf123456 ; encoding: [0x03,0x05,0x82,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x03,0x05,0x82,0xc9,0x04,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmac_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x80,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x80,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xa2,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xa2,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v6, v4, v3 ; encoding: [0x03,0x05,0xaa,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xaa,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xae,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xae,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x94,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x94,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_i32 v6, v4, v3 ; encoding: [0x03,0x05,0xb0,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xb0,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_num_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x96,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x96,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8e,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x8e,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x86,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x86,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: 
v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8a,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x8a,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v6, v4, v3 ; encoding: [0x03,0x05,0xa8,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0xa8,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_subrev_f32 v6, v4, v3 ; encoding: [0x03,0x05,0x8c,0xc9,0x04,0x07,0x06,0xff] +0x03,0x05,0x8c,0xc9,0x04,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v3, v255 :: v_dual_fmamk_f32 v6, v4, 0xaf123456, v255 ; encoding: [0x03,0xff,0x85,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x03,0xff,0x85,0xc9,0x04,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v3, v255 :: v_dual_mov_b32 v6, v4 ; encoding: [0x03,0xff,0x91,0xc9,0x04,0x01,0x06,0xff] +0x03,0xff,0x91,0xc9,0x04,0x01,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x88,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x88,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa0,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xa0,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xac,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xac,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v6, v1, v3 ; encoding: [0x04,0x05,0x92,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x92,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmaak_f32 v6, v1, v3, 0xaf123456 ; encoding: [0x04,0x05,0x82,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf] +0x04,0x05,0x82,0xc9,0x01,0x07,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmac_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x80,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x80,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: 
v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xa2,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xa2,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v6, v1, v3 ; encoding: [0x04,0x05,0xaa,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xaa,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xae,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xae,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x94,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x94,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_i32 v6, v1, v3 ; encoding: [0x04,0x05,0xb0,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xb0,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_num_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x96,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x96,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8e,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x8e,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x86,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x86,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8a,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x8a,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v6, v1, v3 ; encoding: [0x04,0x05,0xa8,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0xa8,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_subrev_f32 v6, v1, v3 ; encoding: [0x04,0x05,0x8c,0xc9,0x01,0x07,0x06,0xff] +0x04,0x05,0x8c,0xc9,0x01,0x07,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, v4, v255 :: v_dual_fmamk_f32 v6, v1, 0xaf123456, v255 ; encoding: 
[0x04,0xff,0x85,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf] +0x04,0xff,0x85,0xc9,0x01,0xff,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, v4, v255 :: v_dual_mov_b32 v6, v1 ; encoding: [0x04,0xff,0x91,0xc9,0x01,0x01,0x06,0xff] +0x04,0xff,0x91,0xc9,0x01,0x01,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x88,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x88,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa0,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xa0,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xac,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xac,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v6, vcc_hi, v3 ; encoding: [0x6b,0x04,0x92,0xc9,0x6b,0x06,0x06,0xff] +0x6b,0x04,0x92,0xc9,0x6b,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmaak_f32 v6, vcc_hi, v3, 0xaf123456 ; encoding: [0x6b,0x04,0x82,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6b,0x04,0x82,0xc9,0x6b,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x80,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x80,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa2,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xa2,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xaa,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xaa,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xae,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xae,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 
v255, vcc_hi, v2 :: v_dual_max_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x94,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x94,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xb0,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xb0,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x96,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x96,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8e,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x8e,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x86,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x86,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8a,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x8a,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0xa8,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0xa8,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v6, exec_lo, v3 ; encoding: [0x6b,0x04,0x8c,0xc9,0x7e,0x06,0x06,0xff] +0x6b,0x04,0x8c,0xc9,0x7e,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_fmamk_f32 v6, vcc_hi, 0xaf123456, v255 ; encoding: [0x6b,0xfe,0x85,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6b,0xfe,0x85,0xc9,0x6b,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v6, exec_lo ; encoding: [0x6b,0xfe,0x91,0xc9,0x7e,0x00,0x06,0xff] +0x6b,0xfe,0x91,0xc9,0x7e,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x88,0xc9,0x7f,0x06,0x06,0xff] 
+0x6a,0x04,0x88,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa0,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xa0,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xac,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xac,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v6, vcc_lo, v3 ; encoding: [0x6a,0x04,0x92,0xc9,0x6a,0x06,0x06,0xff] +0x6a,0x04,0x92,0xc9,0x6a,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmaak_f32 v6, vcc_lo, v3, 0xaf123456 ; encoding: [0x6a,0x04,0x82,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf] +0x6a,0x04,0x82,0xc9,0x6a,0x06,0x06,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x80,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x80,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa2,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xa2,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xaa,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xaa,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xae,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xae,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x94,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x94,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xb0,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xb0,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v6, exec_hi, v3 ; encoding: 
[0x6a,0x04,0x96,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x96,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8e,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x8e,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x86,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x86,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8a,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x8a,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0xa8,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0xa8,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v6, exec_hi, v3 ; encoding: [0x6a,0x04,0x8c,0xc9,0x7f,0x06,0x06,0xff] +0x6a,0x04,0x8c,0xc9,0x7f,0x06,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_fmamk_f32 v6, vcc_lo, 0xaf123456, v255 ; encoding: [0x6a,0xfe,0x85,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf] +0x6a,0xfe,0x85,0xc9,0x6a,0xfe,0x07,0xff,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v6, exec_hi ; encoding: [0x6a,0xfe,0x91,0xc9,0x7f,0x00,0x06,0xff] +0x6a,0xfe,0x91,0xc9,0x7f,0x00,0x06,0xff + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_add_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x88,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x88,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_add_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xa0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_ashrrev_i32 v255, 0xaf123456, v4 ; encoding: 
[0x7c,0x0a,0xac,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xac,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_cndmask_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x92,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x92,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_fmaak_f32 v255, 0xaf123456, v4, 0xaf123456 ; encoding: [0x7c,0x0a,0x82,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x82,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_fmac_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x80,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x80,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_fmamk_f32 v255, 0xaf123456, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x84,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x84,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_lshlrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xa2,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_lshrrev_b32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xaa,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xaa,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_max_i32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xae,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xae,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_max_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x94,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x94,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_min_i32 v255, 
0xaf123456, v4 ; encoding: [0x7c,0x0a,0xb0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xb0,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_min_num_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x96,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x96,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_mov_b32 v255, 0xaf123456 ; encoding: [0x7c,0x0a,0x90,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x90,0xc9,0xff,0x00,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_mul_dx9_zero_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x8e,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_mul_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x86,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x86,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_sub_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x8a,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_sub_nc_u32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0xa8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0xa8,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf + +# GFX1250: v_dual_subrev_f32 v6, null, v5 :: v_dual_subrev_f32 v255, 0xaf123456, v4 ; encoding: [0x7c,0x0a,0x8c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf] +0x7c,0x0a,0x8c,0xc9,0xff,0x08,0xfe,0x06,0x56,0x34,0x12,0xaf diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vopd3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vopd3.txt new file mode 100644 index 0000000000000..5fc7152d19ec0 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vopd3.txt @@ -0,0 +1,14278 @@ +# RUN: llvm-mc 
-triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck %s -strict-whitespace --check-prefix=GFX1250 + +# GFX1250: v_dual_add_f32 v0, -v1, v2 :: v_dual_add_nc_u32 v5, v6, v7 ; encoding: [0x01,0x01,0x11,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x01,0x11,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_add_f32 v0, v1, v2 :: v_dual_add_f32 v5, -s6, v7 ; encoding: [0x01,0x41,0x10,0xcf,0x06,0x10,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x41,0x10,0xcf,0x06,0x10,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] 
+0xc1,0x50,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x11,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: 
v_dual_add_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x10,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: 
[0xf0,0xa0,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x11,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x10,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x40,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x80,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x11,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x10,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x10,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x00,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0xb0,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x11,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x10,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x10,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: 
[0x7d,0x30,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x11,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x10,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x10,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x10,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x00,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 
:: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x11,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x10,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x10,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x10,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: 
[0x69,0x50,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x11,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x69,0x60,0x10,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x10,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x10,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_add_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x11,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x10,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x10,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x10,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x40,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x10,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x80,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x11,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x10,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x10,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x01,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 
v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x11,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x10,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x10,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x10,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: 
[0x02,0x91,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x02,0x31,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x11,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x10,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x10,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x10,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: 
v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xff,0x41,0x11,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x10,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x10,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x10,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: 
v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x11,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x10,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x10,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x03,0x81,0x10,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 ; encoding: [0x04,0x21,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x21,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: 
v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x11,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x61,0x10,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x10,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x10,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x10,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x70,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x11,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x10,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x10,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x6b,0x80,0x10,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x10,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0xa0,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x11,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x10,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x10,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x10,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[0:1], -s[8:9], -v[4:5] :: v_dual_subrev_f32 v5, v6, -v7 ; encoding: [0x08,0x60,0x84,0xcf,0x06,0x27,0x04,0x00,0x00,0x07,0x00,0x05] 
+0x08,0x60,0x84,0xcf,0x06,0x27,0x04,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_add_f64 v[0:1], -v[8:9], v[4:5] :: v_dual_mov_b32 v5, v6 ; encoding: [0x08,0x81,0x84,0xcf,0x06,0x03,0x04,0x00,0x00,0x00,0x00,0x05] +0x08,0x81,0x84,0xcf,0x06,0x03,0x04,0x00,0x00,0x00,0x00,0x05 + +# GFX1250: v_dual_add_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x84,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +0x06,0x41,0x84,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x40,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x85,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x00,0x85,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +0xc1,0x90,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +0xc1,0x00,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x85,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x10,0x85,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xa0,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0xb0,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xb0,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +0xc1,0x80,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x70,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x30,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x50,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x60,0x84,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x40,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x85,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x00,0x85,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +0xf0,0x90,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; 
encoding: [0xf0,0x00,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +0xf0,0x00,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x85,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x10,0x85,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xa0,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xb0,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +0xf0,0x80,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x70,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x30,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x50,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x60,0x84,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: 
[0x7e,0x80,0x84,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7e,0x80,0x84,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x40,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x85,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x00,0x85,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x84,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7e,0x90,0x84,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7e,0x00,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x85,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x10,0x85,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xa0,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xb0,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x70,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; 
encoding: [0x7e,0x30,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x30,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x50,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x60,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x84,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x68,0x80,0x84,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x40,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x85,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x00,0x85,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x68,0x00,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x85,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x10,0x85,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xa0,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: 
v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xb0,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x70,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x30,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x50,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x60,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x84,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x80,0x84,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x90,0x84,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x84,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfd,0x80,0x84,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x40,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 
v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x85,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x00,0x85,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfd,0x90,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfd,0x00,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x85,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x10,0x85,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xa0,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xb0,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x70,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x30,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x50,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + 
+# GFX1250: v_dual_add_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x60,0x84,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x84,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7a,0x80,0x84,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x40,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x85,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x00,0x85,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7a,0x90,0x84,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7a,0x00,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x85,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x10,0x85,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xa0,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7a,0xb0,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xb0,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x70,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x30,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x50,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x60,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x84,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfe,0x81,0x84,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x41,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x85,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x01,0x85,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfe,0x91,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 
v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfe,0x01,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x85,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x11,0x85,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xa1,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xb1,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x71,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x31,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x51,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x61,0x84,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x84,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] 
+0x02,0x81,0x84,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x41,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x85,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x01,0x85,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x91,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x02,0x01,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x85,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x11,0x85,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xa1,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xb1,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x71,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: 
[0x02,0x31,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x31,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x51,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x61,0x84,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x84,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x04,0x81,0x84,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x85,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x85,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x85,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x85,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +0x04,0x91,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, 
v3, v3, vcc_lo ; encoding: [0x04,0x91,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x85,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x85,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x85,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x85,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: 
v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x84,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x84,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x84,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x06,0x81,0x84,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], 
v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x41,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x85,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x01,0x85,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x06,0x91,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x06,0x01,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x85,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x11,0x85,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xa1,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xb1,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x71,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x31,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 
v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x51,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x61,0x84,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x84,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x6a,0x80,0x84,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x40,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x85,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x00,0x85,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x6a,0x90,0x84,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x6a,0x00,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x85,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x10,0x85,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xa0,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# 
GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xb0,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x70,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x30,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x50,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x60,0x84,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x61,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x92 ; encoding: [0x06,0x21,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x92,0x07] +0x06,0x21,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x92,0x07 + +# GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +0x06,0x31,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09 + +# GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] 
+0x06,0x51,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_add_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x41,0x85,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_add_nc_u32 v0, v1, v2 :: v_dual_fma_f32 v5, -v6, v7, -v8 ; encoding: [0x01,0x31,0x41,0xcf,0x06,0x51,0x02,0x00,0x00,0x07,0x08,0x05] +0x01,0x31,0x41,0xcf,0x06,0x51,0x02,0x00,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x40,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x40,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x61,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x84 ; encoding: [0x04,0x21,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x84,0x07] +0x04,0x21,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x84,0x07 + +# GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +0x04,0x31,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07 + +# GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x51,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_add_nc_u32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x41,0x41,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# 
GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x41,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x41,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x41,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x41,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 
:: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x40,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x41,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x41,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x41,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x41,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: 
[0xf0,0x80,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x40,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x41,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x41,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x10,0x41,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x41,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x40,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x40,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x00,0x41,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x41,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x41,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x41,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x60,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x40,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x40,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x41,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x41,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x41,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x41,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7d,0x70,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x40,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x40,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x40,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x41,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x41,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x41,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x41,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_add_nc_u32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x40,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x40,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x41,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x41,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; 
encoding: [0x69,0x90,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x40,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x41,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x41,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x40,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x40,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x69,0x80,0x40,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x41,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x41,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x41,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x41,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x30,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x40,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x40,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x40,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x41,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x41,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x40,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x41,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x41,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7b,0xa0,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x40,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x40,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x41,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x41,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x01,0x91,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x41,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x41,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x40,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x40,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x40,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: 
v_dual_add_nc_u32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x41,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x41,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x41,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x41,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: 
[0x02,0x51,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x40,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x40,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x40,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x41,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x41,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x41,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x41,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xff,0xb1,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x40,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x40,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x40,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x41,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x41,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_add_nc_u32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x41,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x41,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x40,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x40,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x40,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x41,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x41,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: 
[0x04,0x91,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x41,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x41,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x61,0x40,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x40,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x40,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x41,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x41,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x40,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x41,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x41,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x70,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x40,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x40,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x40,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x41,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x41,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x40,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x41,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0x10,0x41,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x40,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_add_nc_u32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x40,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x40,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x58,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x58,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] 
+0x04,0x61,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x87 ; encoding: [0x04,0x21,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x87,0x07] +0x04,0x21,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x87,0x07 + +# GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +0x04,0x31,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07 + +# GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x51,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_ashrrev_i32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x41,0x59,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x59,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x59,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x59,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] 
+0xc1,0x10,0x59,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x58,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x59,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x00,0x59,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x59,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x59,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 
+ +# GFX1250: v_dual_ashrrev_i32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x58,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x59,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x59,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x59,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x59,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x70,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x58,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x58,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x59,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x59,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x59,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x10,0x59,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x58,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x58,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x59,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7d,0x00,0x59,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x59,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x59,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x58,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_ashrrev_i32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x58,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x58,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x59,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x59,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x59,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x59,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_mul_f32 
v7, s105, v3 ; encoding: [0x01,0x30,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x58,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x58,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x59,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x59,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x58,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x59,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x59,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: 
[0x69,0xa0,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x58,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x58,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x58,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x59,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x59,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: 
[0xfd,0x90,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x59,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x59,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x58,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: 
[0xfd,0x80,0x58,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x58,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x59,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x59,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x58,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x59,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x59,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x30,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x58,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x58,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x59,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x59,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x59,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x59,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: 
[0x01,0xa1,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x58,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x58,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x58,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x59,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x59,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x02,0x91,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x59,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x59,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x58,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x58,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x58,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: 
v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x59,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x59,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x59,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x59,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_sub_f32 
v7, v2, v3 ; encoding: [0xff,0x51,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x58,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x58,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x58,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x59,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x59,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x59,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x59,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: 
[0x03,0xb1,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x58,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x58,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x58,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x59,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x59,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x01,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x59,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x59,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x58,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x58,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x58,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x59,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x59,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x58,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x59,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x59,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x58,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x58,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x58,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x59,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x59,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x58,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x59,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x59,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x58,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_ashrrev_i32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x58,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x58,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v0, v1, v2, s96 :: v_dual_add_f32 v5, -s6, -v7 ; encoding: [0x01,0x41,0x24,0xcf,0x06,0x30,0x02,0x60,0x00,0x07,0x00,0x05] +0x01,0x41,0x24,0xcf,0x06,0x30,0x02,0x60,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_cndmask_b32 v0, v1, v2, s96 :: v_dual_fmac_f32 v5, -v6, -v7 ; encoding: [0x01,0x01,0x24,0xcf,0x06,0x31,0x02,0x60,0x00,0x07,0x00,0x05] +0x01,0x01,0x24,0xcf,0x06,0x31,0x02,0x60,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_cndmask_b32 v0, v1, v2, vcc_lo :: v_dual_add_f32 v5, -s6, -v7 ; encoding: [0x01,0x41,0x24,0xcf,0x06,0x30,0x02,0x6a,0x00,0x07,0x00,0x05] +0x01,0x41,0x24,0xcf,0x06,0x30,0x02,0x6a,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_cndmask_b32 v0, v1, v2, vcc_lo :: v_dual_fmac_f32 v5, -v6, -v7 ; encoding: [0x01,0x01,0x24,0xcf,0x06,0x31,0x02,0x6a,0x00,0x07,0x00,0x05] +0x01,0x01,0x24,0xcf,0x06,0x31,0x02,0x6a,0x00,0x07,0x00,0x05 + +# GFX1250: 
v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x40,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x00,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x60,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x00,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x10,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x50,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x70,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] 
+0xc1,0xa0,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x80,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x00,0x00,0x07] +0xc1,0x80,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x70,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x30,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x50,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x40,0x25,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, -1, v4, vcc_lo :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07] +0xc1,0x60,0x24,0xcf,0xfd,0x00,0x04,0x6a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_add_f32 v7, 0.5, v2 ; encoding: 
[0xf0,0x40,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x40,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x00,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x60,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x00,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x10,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x50,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x70,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_min_i32 v7, 0.5, v2 
; encoding: [0xf0,0x80,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x80,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x00,0x00,0x07] +0xf0,0x80,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x70,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x30,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x50,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x40,0x25,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 0.5, v3, vcc_lo :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07] +0xf0,0x60,0x24,0xcf,0xf0,0x00,0x03,0x6a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x40,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x40,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_add_nc_u32 
v7, exec_hi, v3 ; encoding: [0x7f,0x00,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x00,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x7f,0x60,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x60,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x00,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x00,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x7f,0x10,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x10,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x7f,0x50,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x50,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x7f,0x70,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x70,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x7f,0xa0,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x7f,0x80,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x7f,0x80,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x7f,0xb0,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x70,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x70,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x30,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x30,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x50,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x50,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x7f,0x40,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x40,0x25,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v2, vcc_lo :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x7f,0x60,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7f,0x60,0x24,0xcf,0x7f,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_hi, v255, vcc_lo :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x7f,0x80,0x24,0xcf,0x7f,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x7f,0x80,0x24,0xcf,0x7f,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x40,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x40,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_add_nc_u32 v7, exec_lo, v3 ; 
encoding: [0x7e,0x00,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x00,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x7e,0x60,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x60,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x00,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x00,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x7e,0x10,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x10,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x7e,0x50,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x50,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x7e,0x70,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x70,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x7e,0xa0,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x7e,0x80,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x80,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x7e,0xb0,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x70,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x70,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x30,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x30,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x50,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x50,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x7e,0x40,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x40,0x25,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v2, vcc_lo :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x7e,0x60,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7e,0x60,0x24,0xcf,0x7e,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, exec_lo, v255, vcc_lo :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x7e,0x80,0x24,0xcf,0x7e,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x7e,0x80,0x24,0xcf,0x7e,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x40,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x7d,0x00,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x60,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x00,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x10,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x50,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x70,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x80,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x7d,0xb0,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x70,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x30,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x50,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x40,0x25,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v2, vcc_lo :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7d,0x60,0x24,0xcf,0x7d,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, m0, v255, vcc_lo :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x24,0xcf,0x7d,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x7d,0x80,0x24,0xcf,0x7d,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x01,0x40,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x40,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x01,0x00,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x00,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x01,0x60,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x01,0x60,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x01,0x90,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x01,0x00,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x00,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x01,0x10,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x10,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x01,0x50,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x50,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x01,0x70,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x70,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x01,0xa0,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0xa0,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x01,0x80,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x80,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x01,0xb0,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0xb0,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x01,0x70,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x01,0x70,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x01,0x30,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x30,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x01,0x50,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x50,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x01,0x40,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x40,0x25,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v2, vcc_lo :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x01,0x60,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x60,0x24,0xcf,0x01,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s1, v255, vcc_lo :: v_dual_mov_b32 v7, s1 ; encoding: [0x01,0x80,0x24,0xcf,0x01,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x01,0x80,0x24,0xcf,0x01,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x69,0x40,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x40,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x69,0x00,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x00,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x69,0x60,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x60,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: 
[0x69,0x90,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x69,0x90,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x69,0x00,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x00,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x69,0x10,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x10,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x69,0x50,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x50,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x69,0x70,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x70,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x69,0xa0,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0xa0,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x69,0x80,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x80,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x69,0xb0,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0xb0,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x69,0x70,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x70,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_mul_f32 
v7, s105, v3 ; encoding: [0x69,0x30,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x30,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x69,0x50,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x50,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x69,0x40,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x40,0x25,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v2, vcc_lo :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x69,0x60,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x69,0x60,0x24,0xcf,0x69,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, s105, v255, vcc_lo :: v_dual_mov_b32 v7, s105 ; encoding: [0x69,0x80,0x24,0xcf,0x69,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x69,0x80,0x24,0xcf,0x69,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x40,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x00,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x60,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, 
v2, vcc_lo :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x00,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x10,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x50,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x70,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x80,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x70,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x30,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x50,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x40,0x25,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v2, vcc_lo :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0xfd,0x60,0x24,0xcf,0xc1,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, src_scc, v255, vcc_lo :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x24,0xcf,0xc1,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0xfd,0x80,0x24,0xcf,0xc1,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x40,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x40,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7b,0x00,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x00,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7b,0x60,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x60,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x00,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x7b,0x00,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7b,0x10,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x10,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7b,0x50,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x50,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7b,0x70,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x70,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7b,0xa0,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7b,0x80,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x80,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7b,0xb0,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x70,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x70,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x30,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x30,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: 
[0x7b,0x50,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x50,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7b,0x40,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x40,0x25,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v2, vcc_lo :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7b,0x60,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x7b,0x60,0x24,0xcf,0x7b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, ttmp15, v255, vcc_lo :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7b,0x80,0x24,0xcf,0x7b,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x7b,0x80,0x24,0xcf,0x7b,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x41,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x01,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x61,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x01,0x91,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x01,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: 
v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x11,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x51,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x71,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0xa1,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x81,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0xb1,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x71,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x31,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x51,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, 
vcc_lo :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x41,0x25,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v2, vcc_lo :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x01,0x61,0x24,0xcf,0xff,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v1, v255, vcc_lo :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x24,0xcf,0xff,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +0x01,0x81,0x24,0xcf,0xff,0x01,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x41,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x01,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x61,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x02,0x91,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x01,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x11,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, 
vcc_lo :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x51,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x71,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0xa1,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x81,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0xb1,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x71,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x31,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x51,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x41,0x25,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v2, vcc_lo :: 
v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x02,0x61,0x24,0xcf,0x03,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v2, v255, vcc_lo :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x24,0xcf,0x03,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +0x02,0x81,0x24,0xcf,0x03,0x01,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x41,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x01,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x61,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +0xff,0x91,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x01,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x11,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x51,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, 
vcc_lo :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x71,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0xa1,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x81,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0xb1,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x71,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x31,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x51,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x41,0x25,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, v2, vcc_lo :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0xff,0x61,0x24,0xcf,0x02,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v255, 
v255, vcc_lo :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x24,0xcf,0x02,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +0xff,0x81,0x24,0xcf,0x02,0x01,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x41,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x01,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x61,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x03,0x91,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x01,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x11,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x51,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x71,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo 
:: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0xa1,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x81,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0xb1,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x71,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x31,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x51,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x41,0x25,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v2, vcc_lo :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x03,0x61,0x24,0xcf,0x04,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v3, v255, vcc_lo :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x24,0xcf,0x04,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +0x03,0x81,0x24,0xcf,0x04,0x01,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_add_f32 
v7, v1, v3 ; encoding: [0x04,0x41,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x41,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x01,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x61,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:1 ; encoding: [0x04,0x21,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x01,0x07] +0x04,0x21,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x01,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x04,0x07] +0x04,0x31,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x01,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x11,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x51,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x71,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: 
[0x04,0xa1,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0xa1,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x81,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0xb1,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x71,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x31,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x51,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x41,0x25,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s96 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07] +0x04,0x61,0x24,0xcf,0x01,0x01,0x02,0x60,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, s97 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x24,0xcf,0x01,0x01,0x02,0x61,0xff,0x03,0x60,0x07] +0x04,0x91,0x24,0xcf,0x01,0x01,0x02,0x61,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_add_f32 v7, v1, v3 ; encoding: 
[0x04,0x41,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x41,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x01,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x61,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_bitop2_b32 v7, v1, v3 bitop3:1 ; encoding: [0x04,0x21,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x01,0x07] +0x04,0x21,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x01,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x04,0x91,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x04,0x07] +0x04,0x31,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x01,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x11,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x51,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_max_i32 v7, v1, v3 ; 
encoding: [0x04,0x71,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x71,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0xa1,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x81,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0xb1,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x71,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x31,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x51,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x41,0x25,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v2, vcc_lo :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07] +0x04,0x61,0x24,0xcf,0x01,0x01,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v255, s96 :: v_dual_mov_b32 v7, v1 ; encoding: 
[0x04,0x81,0x24,0xcf,0x01,0x01,0xff,0x60,0xff,0x00,0x00,0x07] +0x04,0x81,0x24,0xcf,0x01,0x01,0xff,0x60,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, v4, v255, vcc_lo :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x24,0xcf,0x01,0x01,0xff,0x6a,0xff,0x00,0x00,0x07] +0x04,0x81,0x24,0xcf,0x01,0x01,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x40,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x40,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x6b,0x00,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x00,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x6b,0x60,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x60,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x00,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x00,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x6b,0x10,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x10,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x6b,0x50,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x50,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, 
vcc_lo :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x6b,0x70,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x70,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0xa0,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x6b,0x80,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x80,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0xb0,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x70,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x70,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x30,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x30,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x50,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x50,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x6b,0x40,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6b,0x40,0x25,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v2, vcc_lo :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x6b,0x60,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] 
+0x6b,0x60,0x24,0xcf,0x6b,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_hi, v255, vcc_lo :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x6b,0x80,0x24,0xcf,0x6b,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x6b,0x80,0x24,0xcf,0x6b,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x40,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x40,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x6a,0x00,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x00,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x6a,0x60,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x60,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_cndmask_b32 v28, -v15, v15, s46 :: v_dual_cndmask_b32 v29, -v13, -v13, s46 ; encoding: [0x0f,0x91,0x24,0xcf,0x0d,0x33,0x0f,0x2e,0x1c,0x0d,0x2e,0x1d] +0x0f,0x91,0x24,0xcf,0x0d,0x33,0x0f,0x2e,0x1c,0x0d,0x2e,0x1d + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x00,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x00,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x6a,0x10,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x10,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; 
encoding: [0x6a,0x50,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x50,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x6a,0x70,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x70,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0xa0,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x6a,0x80,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x80,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0xb0,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x70,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x70,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x30,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x30,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x50,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x50,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v2, vcc_lo :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x6a,0x40,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x40,0x25,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, 
vcc_lo, v2, vcc_lo :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x6a,0x60,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07] +0x6a,0x60,0x24,0xcf,0x6a,0x00,0x02,0x6a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_cndmask_b32 v255, vcc_lo, v255, vcc_lo :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x6a,0x80,0x24,0xcf,0x6a,0x00,0xff,0x6a,0xff,0x00,0x00,0x07] +0x6a,0x80,0x24,0xcf,0x6a,0x00,0xff,0x6a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v0, -s1, v2, v3 :: v_dual_bitop2_b32 v5, v6, v7 ; encoding: [0x01,0x20,0x4d,0xcf,0x06,0x03,0x02,0x03,0x00,0x07,0x00,0x05] +0x01,0x20,0x4d,0xcf,0x06,0x03,0x02,0x03,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_fma_f32 v0, -v1, v2, v3 :: v_dual_fma_f32 v5, v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x03,0x02,0x03,0x00,0x07,0x08,0x05] +0x01,0x31,0x4d,0xcf,0x06,0x03,0x02,0x03,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f32 v0, v1, -v2, v3 :: v_dual_fma_f32 v5, v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x05,0x02,0x03,0x00,0x07,0x08,0x05] +0x01,0x31,0x4d,0xcf,0x06,0x05,0x02,0x03,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f32 v0, v1, v2, -v3 :: v_dual_fma_f32 v5, v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x09,0x02,0x03,0x00,0x07,0x08,0x05] +0x01,0x31,0x4d,0xcf,0x06,0x09,0x02,0x03,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, -v6, v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x11,0x02,0x03,0x00,0x07,0x08,0x05] +0x01,0x31,0x4d,0xcf,0x06,0x11,0x02,0x03,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, v6, -v7, v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x21,0x02,0x03,0x00,0x07,0x08,0x05] +0x01,0x31,0x4d,0xcf,0x06,0x21,0x02,0x03,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f32 v0, v1, v2, v3 :: v_dual_fma_f32 v5, v6, v7, -v8 ; encoding: [0x01,0x31,0x4d,0xcf,0x06,0x41,0x02,0x03,0x00,0x07,0x08,0x05] +0x01,0x31,0x4d,0xcf,0x06,0x41,0x02,0x03,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: 
[0x04,0x61,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09] +0x04,0x61,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09 + +# GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_bitop2_b32 v7, v1, v3 ; encoding: [0x04,0x21,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x03,0x00,0x07] +0x04,0x21,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x03,0x04,0x07] +0x04,0x31,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x03,0x04,0x07 + +# GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09] +0x04,0x51,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09 + +# GFX1250: v_dual_fma_f32 v1, v4, v2, v10 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09] +0x04,0x41,0x4d,0xcf,0x01,0x01,0x02,0x0a,0x01,0x0d,0x00,0x09 + +# GFX1250: v_dual_fma_f32 v254, v4, v2, v10 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x41,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x40,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x4d,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x00,0x4d,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0x00,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x00,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x4d,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x10,0x4d,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x00,0x00,0x07] +0xc1,0x80,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x70,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x30,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x50,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, -1, v4, v10 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07] +0xc1,0x60,0x4c,0xcf,0xfd,0x00,0x04,0x0a,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: 
[0xf0,0x40,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x40,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x4d,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x00,0x4d,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x00,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x4d,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x10,0x4d,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x00,0x00,0x07] +0xf0,0x80,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x70,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: 
[0xf0,0x30,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x30,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x50,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, 0.5, v3, v10 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07] +0xf0,0x60,0x4c,0xcf,0xf0,0x00,0x03,0x0a,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x40,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x4d,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x00,0x4d,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x00,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x4d,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x10,0x4d,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0xb0,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x70,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x30,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x50,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v2, v10 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7f,0x60,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_hi, v255, v10 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x4c,0xcf,0x6b,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x7f,0x80,0x4c,0xcf,0x6b,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x40,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x4d,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x00,0x4d,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_fmac_f32 v7, ttmp15, v3 ; 
encoding: [0x7e,0x00,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x00,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x4d,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x10,0x4d,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x70,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x30,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x50,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v2, v10 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7e,0x60,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, exec_lo, v255, v10 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x4c,0xcf,0x7b,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x7e,0x80,0x4c,0xcf,0x7b,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_add_f32 v7, m0, v3 ; 
encoding: [0x7d,0x40,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x40,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x4d,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x00,0x4d,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x00,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x4d,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x10,0x4d,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x70,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x30,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_sub_f32 v7, m0, v3 ; encoding: 
[0x7d,0x50,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x50,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v2, v10 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7d,0x60,0x4c,0xcf,0x7d,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, m0, v255, v10 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x4c,0xcf,0x7d,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x7d,0x80,0x4c,0xcf,0x7d,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x40,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x4d,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x00,0x4d,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x01,0x90,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x00,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x4d,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x10,0x4d,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0xa0,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: 
[0x01,0xb0,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0xb0,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x70,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x30,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x50,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v2, v10 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x60,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s1, v255, v10 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x4c,0xcf,0x69,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x01,0x80,0x4c,0xcf,0x69,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x40,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x4d,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x00,0x4d,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x69,0x90,0x4c,0xcf,0x69,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: 
[0x69,0x00,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x00,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x4d,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x10,0x4d,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0xa0,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0xb0,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x70,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x30,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x50,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v2, v10 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x69,0x60,0x4c,0xcf,0x01,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, s105, v255, v10 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x4c,0xcf,0x01,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x69,0x80,0x4c,0xcf,0x01,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_add_f32 v7, -1, v3 ; encoding: 
[0xfd,0x40,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x40,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x4d,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x00,0x4d,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x00,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x4d,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x10,0x4d,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x70,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x30,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_sub_f32 v7, -1, v3 ; encoding: 
[0xfd,0x50,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x50,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v2, v10 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0xfd,0x60,0x4c,0xcf,0xc1,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, src_scc, v255, v10 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x4c,0xcf,0xc1,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0xfd,0x80,0x4c,0xcf,0xc1,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x40,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x4d,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x00,0x4d,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x4c,0xcf,0x7b,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x00,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x4d,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x10,0x4d,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0xb0,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x70,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x30,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x50,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v2, v10 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x7b,0x60,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, ttmp15, v255, v10 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x4c,0xcf,0x6a,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x7b,0x80,0x4c,0xcf,0x6a,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x41,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x4d,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x01,0x4d,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x01,0x91,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: 
[0x01,0x01,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x01,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x4d,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x11,0x4d,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0xa1,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0xb1,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x71,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x31,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x51,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v2, v10 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x01,0x61,0x4c,0xcf,0xff,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v1, v255, v10 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x4c,0xcf,0xff,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +0x01,0x81,0x4c,0xcf,0xff,0x01,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] 
+0x02,0x41,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x4d,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x01,0x4d,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x02,0x91,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x01,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x4d,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x11,0x4d,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0xa1,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0xb1,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x71,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x31,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x51,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + 
+# GFX1250: v_dual_fma_f32 v255, v2, v2, v10 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x02,0x61,0x4c,0xcf,0x03,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v2, v255, v10 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x4c,0xcf,0x03,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +0x02,0x81,0x4c,0xcf,0x03,0x01,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x41,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x4d,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x01,0x4d,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +0xff,0x91,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x01,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x4d,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x11,0x4d,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0xa1,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0xb1,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: 
v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x71,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x31,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x51,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v2, v10 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0xff,0x61,0x4c,0xcf,0x02,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v255, v255, v10 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x4c,0xcf,0x02,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +0xff,0x81,0x4c,0xcf,0x02,0x01,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x41,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x4d,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x01,0x4d,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x03,0x91,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x01,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: 
[0x03,0x11,0x4d,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x11,0x4d,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0xa1,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0xb1,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x71,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x31,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x51,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v2, v10 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x03,0x61,0x4c,0xcf,0x04,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v3, v255, v10 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x4c,0xcf,0x04,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +0x03,0x81,0x4c,0xcf,0x04,0x01,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x4d,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x01,0x4d,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x60,0x07] 
+0x04,0x91,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x04,0x91,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x01,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x4d,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x11,0x4d,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0xa1,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0xb1,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x71,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x31,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x51,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, v4, v2, v10 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07] +0x04,0x61,0x4c,0xcf,0x01,0x01,0x02,0x0a,0xff,0x03,0x00,0x07 + 
+# GFX1250: v_dual_fma_f32 v255, v4, v255, v10 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x4c,0xcf,0x01,0x01,0xff,0x0a,0xff,0x00,0x00,0x07] +0x04,0x81,0x4c,0xcf,0x01,0x01,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x40,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x4d,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x00,0x4d,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x4c,0xcf,0x6b,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x00,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x4d,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x10,0x4d,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] 
+0x6b,0x70,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x30,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x50,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v2, v10 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6b,0x60,0x4c,0xcf,0x7e,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_hi, v255, v10 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x4c,0xcf,0x7e,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x6b,0x80,0x4c,0xcf,0x7e,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x40,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x4d,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x00,0x4d,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x4c,0xcf,0x6a,0x00,0x02,0x0a,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x00,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x4d,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] 
+0x6a,0x10,0x4d,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x70,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x30,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x50,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v2, v10 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07] +0x6a,0x60,0x4c,0xcf,0x7f,0x00,0x02,0x0a,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f32 v255, vcc_lo, v255, v10 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x4c,0xcf,0x7f,0x00,0xff,0x0a,0xff,0x00,0x00,0x07] +0x6a,0x80,0x4c,0xcf,0x7f,0x00,0xff,0x0a,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[0:1], -v[8:9], -v[4:5], -v[10:11] :: v_dual_add_nc_u32 v5, v6, v7 ; encoding: [0x08,0x01,0x81,0xcf,0x06,0x0f,0x04,0x0a,0x00,0x07,0x00,0x05] +0x08,0x01,0x81,0xcf,0x06,0x0f,0x04,0x0a,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_fma_f64 v[0:1], v[8:9], v[4:5], -v[10:11] :: v_dual_fma_f32 v5, v6, v7, -v8 ; encoding: 
[0x08,0x31,0x81,0xcf,0x06,0x49,0x04,0x0a,0x00,0x07,0x08,0x05] +0x08,0x31,0x81,0xcf,0x06,0x49,0x04,0x0a,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_fma_f64 v[252:253], v[6:7], v[4:5], v[10:11] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfc,0x03,0x00,0x08] +0x06,0x41,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfc,0x03,0x00,0x08 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x40,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x81,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x00,0x81,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x6a,0x07] +0xc1,0x90,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x09] +0xc1,0x00,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x81,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x10,0x81,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0xa0,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0xb0,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + 
+# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x00,0x00,0x07] +0xc1,0x80,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x70,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x30,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x50,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], -1, v[6:7], v[10:11] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07] +0xc1,0x60,0x80,0xcf,0xfd,0x00,0x06,0x0a,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x40,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x81,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x00,0x81,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x6a,0x07] +0xf0,0x90,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: 
[0xf0,0x00,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x09] +0xf0,0x00,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x81,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x10,0x81,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0xa0,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0xb0,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x00,0x00,0x07] +0xf0,0x80,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x70,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x30,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x50,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], 0.5, v[8:9], v[10:11] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07] +0xf0,0x60,0x80,0xcf,0xf0,0x00,0x08,0x0a,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_fma_f64 
v[254:255], exec, v[254:255], v[10:11] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x80,0xcf,0x7b,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x7e,0x80,0x80,0xcf,0x7b,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x40,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x81,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x00,0x81,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x80,0xcf,0x0a,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x7e,0x90,0x80,0xcf,0x0a,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x7e,0x00,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x81,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x10,0x81,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0xa0,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0xb0,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: 
[0x7e,0x70,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x70,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x30,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x50,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], exec, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7e,0x60,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[254:255], v[10:11] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x80,0xcf,0x01,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x68,0x80,0x80,0xcf,0x01,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x40,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x81,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x00,0x81,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x68,0x00,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x81,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x10,0x81,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# 
GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0xa0,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0xb0,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x70,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x30,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x50,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[104:105], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x68,0x60,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[2:3], v[254:255], v[10:11] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x80,0xcf,0x69,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x02,0x80,0x80,0xcf,0x69,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], s[2:3], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x02,0x90,0x80,0xcf,0x01,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[254:255], v[10:11] :: v_dual_mov_b32 v7, -1 ; 
encoding: [0xfd,0x80,0x80,0xcf,0xc1,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0xfd,0x80,0x80,0xcf,0xc1,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0x40,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x81,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0x00,0x81,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0xfd,0x90,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09] +0xfd,0x00,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x81,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0x10,0x81,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0xa0,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0xb0,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] 
+0xfd,0x70,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0x30,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0x50,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], src_scc, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfd,0x60,0x80,0xcf,0xc1,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[254:255], v[10:11] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x80,0xcf,0x6a,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x7a,0x80,0x80,0xcf,0x6a,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x40,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x81,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x00,0x81,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x7a,0x90,0x80,0xcf,0x0f,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x7a,0x00,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 
v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x81,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x10,0x81,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0xa0,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0xb0,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x70,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x30,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x50,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], ttmp[14:15], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x7a,0x60,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x80,0xcf,0x04,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0xfe,0x81,0x80,0xcf,0x04,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_add_f32 v7, v5, 
v3 ; encoding: [0xfe,0x41,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0x41,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x81,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0x01,0x81,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0xfe,0x91,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0xfe,0x01,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x81,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0x11,0x81,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0xa1,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0xb1,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0x71,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] 
+0xfe,0x31,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0x51,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[254:255], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0xfe,0x61,0x80,0xcf,0x05,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x80,0xcf,0xfd,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x02,0x81,0x80,0xcf,0xfd,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x41,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x81,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x01,0x81,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x02,0x91,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x02,0x01,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x81,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x11,0x81,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], 
v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0xa1,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0xb1,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x71,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x31,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x51,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[2:3], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x02,0x61,0x80,0xcf,0xfd,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x80,0xcf,0x03,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x04,0x81,0x80,0xcf,0x03,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x41,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x81,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] 
+0x04,0x01,0x81,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x81,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x01,0x81,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x60,0x07] +0x04,0x91,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x60,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x04,0x01,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x04,0x01,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x81,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x11,0x81,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x81,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x11,0x81,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], 
v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x71,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x71,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x31,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x31,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] 
+0x04,0x51,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x51,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x61,0x80,0xcf,0x02,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[4:5], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x04,0x61,0x80,0xcf,0x03,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[254:255], v[10:11] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x80,0xcf,0x04,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x06,0x81,0x80,0xcf,0x04,0x01,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x41,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x81,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x01,0x81,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x06,0x91,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x06,0x01,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: 
v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x81,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x11,0x81,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0xa1,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0xb1,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x71,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x31,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x51,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], v[6:7], v[4:5], v[10:11] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x06,0x61,0x80,0xcf,0x04,0x01,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[254:255], v[10:11] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x80,0xcf,0x7f,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07] +0x6a,0x80,0x80,0xcf,0x7f,0x00,0xfe,0x0a,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] 
+0x6a,0x40,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x81,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0x00,0x81,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x6a,0x07] +0x6a,0x90,0x80,0xcf,0x6a,0x00,0x04,0x0a,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09] +0x6a,0x00,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x81,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0x10,0x81,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0xa0,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0xb0,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0x70,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0x30,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, 
v[4:5], v[10:11] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0x50,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[254:255], vcc, v[4:5], v[10:11] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07] +0x6a,0x60,0x80,0xcf,0x7f,0x00,0x04,0x0a,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09] +0x06,0x61,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x91 ; encoding: [0x06,0x21,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x03,0x91,0x07] +0x06,0x21,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x03,0x91,0x07 + +# GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x04,0x09] +0x06,0x31,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x04,0x09 + +# GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09] +0x06,0x51,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_fma_f64 v[2:3], v[6:7], v[4:5], v[10:11] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09] +0x06,0x41,0x81,0xcf,0x01,0x01,0x04,0x0a,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v0, -v1, -v2 :: v_dual_ashrrev_i32 v5, v6, v7 ; encoding: [0x01,0x61,0x01,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x61,0x01,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_fmac_f32 v0, v1, -v2 :: v_dual_fmac_f32 v5, -v6, v7 ; encoding: [0x01,0x01,0x00,0xcf,0x06,0x15,0x02,0x00,0x00,0x07,0x00,0x05] 
+0x01,0x01,0x00,0xcf,0x06,0x15,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 
+ +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x01,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x00,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; 
encoding: [0xf0,0x60,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x70,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x01,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x00,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x10,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x40,0x01,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x00,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x00,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x70,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x01,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x00,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x7e,0x80,0x00,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 
:: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x01,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x00,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x00,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x00,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x60,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x01,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x00,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x00,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x00,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: 
[0x69,0x50,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x01,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x69,0x60,0x00,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x00,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x00,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x01,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x00,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x00,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x00,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, 
vcc_lo, v3 ; encoding: [0x7b,0x00,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x00,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x70,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x01,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x00,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x00,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x01,0x91,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 
v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x01,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x00,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x00,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x00,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: 
[0x02,0x71,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x01,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x00,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x00,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x02,0x81,0x00,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_fmac_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x01,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x00,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x00,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x00,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: 
[0x03,0x61,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x03,0x31,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x01,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x00,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x00,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x00,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x14 ; encoding: [0x04,0x21,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x14,0x07] +0x04,0x21,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x14,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, 
v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: 
[0x04,0x31,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x01,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x00,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x00,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x00,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x00,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x10,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x40,0x01,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x00,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x00,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x00,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x00,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0x70,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x01,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x00,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x00,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x6a,0x80,0x00,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_fmac_f32 v7, -1, v4 :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x00,0xcf,0xfd,0x00,0x04,0x00,0x07,0x05,0x00,0x09] +0xc1,0x00,0x00,0xcf,0xfd,0x00,0x04,0x00,0x07,0x05,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, 0.5, v3 :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x00,0xcf,0xf0,0x00,0x03,0x00,0x07,0x02,0x00,0x09] +0xf0,0x00,0x00,0xcf,0xf0,0x00,0x03,0x00,0x07,0x02,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, exec_hi, v2 :: v_dual_fmac_f32 v9, vcc_hi, v3 ; encoding: [0x7f,0x00,0x00,0xcf,0x6b,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x7f,0x00,0x00,0xcf,0x6b,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, exec_lo, v2 :: v_dual_fmac_f32 v9, ttmp15, v3 ; encoding: [0x7e,0x00,0x00,0xcf,0x7b,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x7e,0x00,0x00,0xcf,0x7b,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, m0, v2 :: v_dual_fmac_f32 v9, m0, v3 ; encoding: [0x7d,0x00,0x00,0xcf,0x7d,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x7d,0x00,0x00,0xcf,0x7d,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, s1, v2 :: v_dual_fmac_f32 v9, s105, v3 ; encoding: [0x01,0x00,0x00,0xcf,0x69,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x01,0x00,0x00,0xcf,0x69,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, s105, v2 :: v_dual_fmac_f32 v9, s1, v3 ; encoding: [0x69,0x00,0x00,0xcf,0x01,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x69,0x00,0x00,0xcf,0x01,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, src_scc, v2 :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x00,0xcf,0xc1,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0xfd,0x00,0x00,0xcf,0xc1,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, ttmp15, v2 :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7b,0x00,0x00,0xcf,0x6a,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x7b,0x00,0x00,0xcf,0x6a,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, v1, v2 
:: v_dual_fmac_f32 v9, v255, v3 ; encoding: [0x01,0x01,0x00,0xcf,0xff,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +0x01,0x01,0x00,0xcf,0xff,0x01,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, v2, v2 :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x02,0x01,0x00,0xcf,0x03,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +0x02,0x01,0x00,0xcf,0x03,0x01,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, v255, v2 :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0xff,0x01,0x00,0xcf,0x02,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +0xff,0x01,0x00,0xcf,0x02,0x01,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, v3, v2 :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x03,0x01,0x00,0xcf,0x04,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +0x03,0x01,0x00,0xcf,0x04,0x01,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, v4, v2 :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x04,0x01,0x00,0xcf,0x01,0x01,0x02,0x00,0x07,0x03,0x00,0x09] +0x04,0x01,0x00,0xcf,0x01,0x01,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, vcc_hi, v2 :: v_dual_fmac_f32 v9, exec_lo, v3 ; encoding: [0x6b,0x00,0x00,0xcf,0x7e,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x6b,0x00,0x00,0xcf,0x7e,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_fmac_f32 v7, vcc_lo, v2 :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x00,0xcf,0x7f,0x00,0x02,0x00,0x07,0x03,0x00,0x09] +0x6a,0x00,0x00,0xcf,0x7f,0x00,0x02,0x00,0x07,0x03,0x00,0x09 + +# GFX1250: v_dual_lshlrev_b32 v0, v1, v2 :: v_dual_min_num_f32 v5, -s6, -v7 ; encoding: [0x01,0xb1,0x44,0xcf,0x06,0x30,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0xb1,0x44,0xcf,0x06,0x30,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x44,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x44,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: 
[0x04,0x61,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x61,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x85 ; encoding: [0x04,0x21,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x85,0x07] +0x04,0x21,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x85,0x07 + +# GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +0x04,0x31,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07 + +# GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x51,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_lshlrev_b32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x41,0x45,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x45,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x45,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: 
[0xc1,0x10,0x45,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x45,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x44,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: 
[0xf0,0x00,0x45,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x45,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x45,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x45,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x50,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x44,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x45,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x45,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x45,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x45,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x70,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x44,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x44,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x45,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x45,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x10,0x45,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x45,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x44,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x44,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: 
[0x7d,0x00,0x45,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x45,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x45,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x45,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7d,0x60,0x44,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x44,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x44,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x45,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x45,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x45,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x45,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 
+ +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x44,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x44,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x45,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x45,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x44,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x45,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x45,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: 
v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x44,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x44,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x44,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x45,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x45,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: 
[0xfd,0x90,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x45,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x45,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x44,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: 
[0xfd,0x80,0x44,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x44,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x45,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x45,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x44,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x45,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x45,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x30,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x44,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x44,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x45,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x45,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x45,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x45,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: 
[0x01,0xa1,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x44,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x44,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x44,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x45,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x45,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x02,0x91,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x45,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x45,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x44,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x44,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x44,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: 
v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x45,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x45,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x45,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x45,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_sub_f32 
v7, v2, v3 ; encoding: [0xff,0x51,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x44,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x44,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x44,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x45,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x45,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x45,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x45,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: 
[0x03,0xb1,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x44,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x44,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x44,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x45,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x45,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x01,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x45,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x45,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x44,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x44,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x44,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x45,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x45,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x44,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x45,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x45,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x44,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x44,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x44,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x45,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x45,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x44,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x45,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x45,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x44,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshlrev_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x44,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x44,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09] +0x04,0x61,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09 + +# GFX1250: v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09] +0x04,0x51,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09 + +# GFX1250: v_dual_lshrrev_b32 v1, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09] +0x04,0x41,0x55,0xcf,0x01,0x01,0x02,0x00,0x01,0x0d,0x00,0x09 + +# GFX1250: v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x54,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x54,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v254, v4, v2 :: 
v_dual_bitop2_b32 v7, v1, v3 bitop3:0x88 ; encoding: [0x04,0x21,0x55,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x88,0x07] +0x04,0x21,0x55,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x88,0x07 + +# GFX1250: v_dual_lshrrev_b32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x55,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +0x04,0x31,0x55,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x55,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x55,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x55,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x55,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc 
; encoding: [0xc1,0x80,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x54,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x55,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x55,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: 
[0xf0,0x10,0x55,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x55,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x54,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x00,0x55,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x55,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x55,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x55,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; 
encoding: [0x7f,0x60,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x54,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x54,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x55,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x55,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x55,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x55,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, 
ttmp15, v3 ; encoding: [0x7e,0x70,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x54,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x54,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x55,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x55,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: 
[0x7d,0x10,0x55,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x55,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x54,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x54,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x54,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x55,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x00,0x55,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x55,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x55,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x60,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x54,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x54,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x55,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x55,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x54,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x55,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x55,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x69,0x70,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x54,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x54,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x54,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x55,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x55,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x55,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x10,0x55,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x54,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x54,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x54,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x55,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7b,0x00,0x55,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x54,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x55,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x55,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7b,0x60,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x54,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x54,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x55,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x55,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x55,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x55,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x71,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x54,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x54,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x54,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x55,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x55,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x55,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x55,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x54,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x54,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x54,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x55,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x55,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; 
encoding: [0xff,0x91,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x55,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x55,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x54,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: 
[0xff,0x81,0x54,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x54,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x55,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x55,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x55,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x55,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x03,0x31,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x54,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x54,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x54,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x55,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x55,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x55,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x55,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x54,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x54,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x54,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x55,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x55,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x54,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_fmac_f32 
v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x55,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x55,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x54,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x54,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x54,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, 
exec_hi, v3 ; encoding: [0x6a,0x40,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x55,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x55,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x54,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x55,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x55,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: 
v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x54,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_lshrrev_b32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x54,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x54,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v0, -v1, v2 :: v_dual_min_num_f32 v5, v6, v7 ; encoding: [0x01,0xb1,0x28,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0xb1,0x28,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_max_num_f32 v0, -v1, v2 :: v_dual_mul_dx9_zero_f32 v5, -v6, -v7 ; encoding: [0x01,0x71,0x28,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x71,0x28,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_max_num_f32 v0, v1, -v2 :: v_dual_cndmask_b32 v5, v6, v7, s96 ; encoding: [0x01,0x91,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x60,0x05] +0x01,0x91,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x60,0x05 + +# GFX1250: v_dual_max_num_f32 v0, v1, -v2 :: v_dual_cndmask_b32 v5, v6, v7, vcc_lo ; encoding: [0x01,0x91,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x6a,0x05] +0x01,0x91,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x6a,0x05 + +# GFX1250: v_dual_max_num_f32 v0, v1, -v2 :: v_dual_mov_b32 v5, v6 ; encoding: [0x01,0x81,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x00,0x00,0x05] +0x01,0x81,0x28,0xcf,0x06,0x05,0x02,0x00,0x00,0x00,0x00,0x05 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: 
[0xc1,0x00,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0xb0,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x29,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x28,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: 
[0xf0,0x60,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] 
+0xf0,0x80,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x29,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x28,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x7f,0x90,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x30,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x29,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x28,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x28,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x00,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x50,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x29,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x28,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x28,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7d,0x10,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x29,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_max_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x28,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x28,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x28,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; 
encoding: [0x01,0x70,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x29,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: 
[0x01,0x80,0x28,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x28,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x28,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: 
[0x69,0xa0,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x29,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x28,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x28,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x28,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x40,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x80,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x29,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x28,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x28,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x28,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7b,0x00,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x28,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7b,0xb0,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x29,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x28,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x28,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x61,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x71,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x29,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x28,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x28,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x28,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: 
v_dual_max_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: 
[0x02,0x51,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x29,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x28,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x28,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x28,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xff,0x11,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x29,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_max_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x28,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x28,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x28,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; 
encoding: [0x03,0x71,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x29,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x28,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x28,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x03,0x81,0x28,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x6e ; encoding: [0x04,0x21,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6e,0x07] +0x04,0x21,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6e,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x11,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x29,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_max_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x28,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x28,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x28,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x28,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 
v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x29,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x28,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 
vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x28,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x28,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x28,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, 
vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x29,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x28,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x28,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x28,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[0:1], 
-v[8:9], -v[4:5] :: v_dual_mul_f32 v5, -v6, v7 ; encoding: [0x08,0x31,0x8c,0xcf,0x06,0x17,0x04,0x00,0x00,0x07,0x00,0x05] +0x08,0x31,0x8c,0xcf,0x06,0x17,0x04,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_max_num_f64 v[0:1], v[8:9], -v[4:5] :: v_dual_sub_nc_u32 v5, v6, v7 ; encoding: [0x08,0x41,0x8d,0xcf,0x06,0x05,0x04,0x00,0x00,0x07,0x00,0x05] +0x08,0x41,0x8d,0xcf,0x06,0x05,0x04,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_max_num_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +0x06,0x41,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x40,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x8d,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x00,0x8d,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +0xc1,0x90,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +0xc1,0x00,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x8d,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x10,0x8d,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] 
+0xc1,0xa0,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xb0,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +0xc1,0x80,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x70,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x30,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x50,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x60,0x8c,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x40,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x8d,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x00,0x8d,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: 
[0xf0,0x90,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +0xf0,0x90,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +0xf0,0x00,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x8d,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x10,0x8d,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xa0,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xb0,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +0xf0,0x80,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x70,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x30,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x50,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 
0.5, v2 ; encoding: [0xf0,0x60,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x60,0x8c,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x8c,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7e,0x80,0x8c,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x40,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x8d,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x00,0x8d,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x8c,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7e,0x90,0x8c,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7e,0x00,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x8d,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x10,0x8d,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xa0,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xb0,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 
v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x70,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x30,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x50,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x60,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x8c,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x68,0x80,0x8c,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x40,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x8d,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x00,0x8d,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x68,0x00,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x8d,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0x68,0x10,0x8d,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xa0,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xb0,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x70,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x30,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x50,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x60,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x8c,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x80,0x8c,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x90,0x8c,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, 
-1 ; encoding: [0xfd,0x80,0x8c,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfd,0x80,0x8c,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x40,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x8d,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x00,0x8d,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfd,0x90,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfd,0x00,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x8d,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x10,0x8d,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xa0,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xb0,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x70,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: 
v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x30,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x50,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x60,0x8c,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x8c,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7a,0x80,0x8c,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x40,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x8d,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x00,0x8d,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7a,0x90,0x8c,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7a,0x00,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: 
[0x7a,0x10,0x8d,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x10,0x8d,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xa0,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xb0,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x70,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x30,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x50,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x60,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x8c,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfe,0x81,0x8c,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x41,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + 
+# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x8d,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x01,0x8d,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfe,0x91,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfe,0x01,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x8d,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x11,0x8d,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xa1,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xb1,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x71,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x31,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: 
[0xfe,0x51,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x51,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x61,0x8c,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x8c,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x81,0x8c,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x41,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x8d,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x01,0x8d,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x91,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x02,0x01,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x8d,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x11,0x8d,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xa1,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 
v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xb1,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x71,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x31,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x51,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x61,0x8c,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x8c,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x04,0x81,0x8c,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x8d,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x8d,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x8d,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0x04,0x01,0x8d,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +0x04,0x91,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x8d,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x8d,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x8d,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x8d,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; 
encoding: [0x04,0xa1,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], 
v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x8c,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x8c,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x8c,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x06,0x81,0x8c,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x41,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x8d,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x01,0x8d,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x06,0x91,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x06,0x01,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x8d,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x11,0x8d,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0x06,0xa1,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xb1,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x71,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x31,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x51,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x61,0x8c,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x8c,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x6a,0x80,0x8c,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x40,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x8d,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x00,0x8d,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: 
[0x6a,0x90,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x6a,0x90,0x8c,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x6a,0x00,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x8d,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x10,0x8d,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xa0,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xb0,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x70,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x30,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x50,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x60,0x8c,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_max_num_f64 v[2:3], 
v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x61,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x94 ; encoding: [0x06,0x21,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x94,0x07] +0x06,0x21,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x94,0x07 + +# GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +0x06,0x31,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09 + +# GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x51,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_max_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x41,0x8d,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_min_num_f32 v0, -v1, v2 :: v_dual_sub_f32 v5, -v6, -v7 ; encoding: [0x01,0x51,0x2c,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x51,0x2c,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_min_num_f32 v0, v1, -v2 :: v_dual_add_nc_u32 v5, v6, v7 ; encoding: [0x01,0x01,0x2d,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x01,0x2d,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: 
v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; 
encoding: [0xc1,0x80,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x2d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x2c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: 
[0xf0,0x90,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x70,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x2d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x2c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x00,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0x50,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x2d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x2c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x2c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x10,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x40,0x2d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x2c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x2c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + 
+# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x2d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x2c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: 
[0x7d,0x80,0x2c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x2c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0xa0,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x2d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x2c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x2c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_min_num_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x2c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_min_num_f32 
v7, s1, v3 ; encoding: [0x69,0xb0,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x2d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x2c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x2c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x2c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: 
[0xfd,0x60,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: 
[0xfd,0x70,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x2d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x2c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x2c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x2c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: 
[0x7b,0x90,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x2c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x30,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x2d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x2c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x2c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: 
[0x01,0x01,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x51,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x2d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x2c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x2c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x2c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_min_num_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x2d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: 
[0x02,0x61,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x2c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x2c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x2c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xff,0x71,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x2d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x2c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x2c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x2c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# 
GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; 
encoding: [0x03,0x81,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x2d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x2c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x2c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x2c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x01,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0xff ; encoding: [0x04,0x21,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0xff,0x07] +0x04,0x21,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0xff,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x71,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x2d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x2c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x2c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x2c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: 
v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x2c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x2d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x2c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x2c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x2c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x2c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x2d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x2c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x2c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x2c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[0:1], -s[8:9], v[4:5] :: v_dual_ashrrev_i32 v5, v6, v7 ; encoding: [0x08,0x60,0x91,0xcf,0x06,0x03,0x04,0x00,0x00,0x07,0x00,0x05] +0x08,0x60,0x91,0xcf,0x06,0x03,0x04,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_min_num_f64 v[0:1], -v[8:9], -v[4:5] :: v_dual_subrev_f32 v5, v6, v7 ; encoding: [0x08,0x61,0x90,0xcf,0x06,0x07,0x04,0x00,0x00,0x07,0x00,0x05] +0x08,0x61,0x90,0xcf,0x06,0x07,0x04,0x00,0x00,0x07,0x00,0x05 + +# 
GFX1250: v_dual_min_num_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x90,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +0x06,0x41,0x90,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x40,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x91,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x00,0x91,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +0xc1,0x90,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +0xc1,0x00,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x91,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x10,0x91,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xa0,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xb0,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] 
+0xc1,0x80,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x70,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x30,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x50,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x60,0x90,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x40,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x91,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x00,0x91,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +0xf0,0x90,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +0xf0,0x00,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: 
[0xf0,0x10,0x91,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x10,0x91,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xa0,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xb0,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +0xf0,0x80,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x70,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x30,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x50,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x60,0x90,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x90,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7e,0x80,0x90,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, 
v15, v3 ; encoding: [0x7e,0x40,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x40,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x91,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x00,0x91,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x90,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7e,0x90,0x90,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7e,0x00,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x91,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x10,0x91,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xa0,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xb0,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 ; encoding: [0x7e,0x70,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x70,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x30,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 
v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x50,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x60,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x90,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x68,0x80,0x90,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x40,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x91,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x00,0x91,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x68,0x00,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x91,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x10,0x91,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xa0,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0x68,0xb0,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x70,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x30,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x50,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x60,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x90,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x80,0x90,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x90,0x90,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x90,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfd,0x80,0x90,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x40,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; 
encoding: [0xfd,0x00,0x91,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x00,0x91,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfd,0x90,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfd,0x00,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x91,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x10,0x91,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xa0,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xb0,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x70,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x30,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x50,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 
v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x60,0x90,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x90,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7a,0x80,0x90,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x40,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x91,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x00,0x91,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7a,0x90,0x90,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7a,0x00,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x91,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x10,0x91,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xa0,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xa0,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7a,0xb0,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xb0,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x70,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x30,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x50,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x60,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x90,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfe,0x81,0x90,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x41,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x91,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x01,0x91,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfe,0x91,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# 
GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfe,0x01,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x91,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x11,0x91,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xa1,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xb1,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x71,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x31,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x51,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x61,0x90,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: 
[0x02,0x81,0x90,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x81,0x90,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x41,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x91,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x01,0x91,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x91,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x02,0x01,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x91,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x11,0x91,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xa1,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xb1,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: [0x02,0x71,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x71,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: 
v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x31,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x51,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x61,0x90,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x90,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x04,0x81,0x90,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x91,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x91,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x91,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x91,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +0x04,0x91,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0x04,0x91,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] 
+0x04,0x91,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x91,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x91,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x91,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x91,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v3, v3 ; encoding: 
[0x04,0xb1,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x90,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x90,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], 
v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x90,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x06,0x81,0x90,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x41,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x91,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x01,0x91,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x06,0x91,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x06,0x01,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x91,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x11,0x91,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xa1,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xb1,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x71,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 
+ +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x31,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x51,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x61,0x90,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x90,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x6a,0x80,0x90,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x40,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x91,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x00,0x91,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x6a,0x90,0x90,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x6a,0x00,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x91,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0x6a,0x10,0x91,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xa0,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xb0,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x70,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x30,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x50,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x60,0x90,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x61,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x95 ; encoding: [0x06,0x21,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x95,0x07] +0x06,0x21,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x95,0x07 + +# GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: 
[0x06,0x31,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] +0x06,0x31,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09 + +# GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x51,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_min_num_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x41,0x91,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_mov_b32 v0, v1 :: v_dual_max_num_f32 v5, -s6, -v7 ; encoding: [0x01,0xa1,0x20,0xcf,0x06,0x30,0x00,0x00,0x00,0x07,0x00,0x05] +0x01,0xa1,0x20,0xcf,0x06,0x30,0x00,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_add_f32 v7, src_scc, v4 ; encoding: [0xc1,0x40,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x40,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_add_nc_u32 v7, src_scc, v4 ; encoding: [0xc1,0x00,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x00,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_ashrrev_i32 v7, src_scc, v4 ; encoding: [0xc1,0x60,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x60,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_cndmask_b32 v7, src_scc, v4, vcc_lo ; encoding: [0xc1,0x90,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x6a,0x07] +0xc1,0x90,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_fmac_f32 v7, src_scc, v4 ; encoding: [0xc1,0x00,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x00,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_lshlrev_b32 v7, src_scc, v4 ; encoding: [0xc1,0x10,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] 
+0xc1,0x10,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_lshrrev_b32 v7, src_scc, v4 ; encoding: [0xc1,0x50,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x50,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_max_i32 v7, src_scc, v4 ; encoding: [0xc1,0x70,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x70,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_max_num_f32 v7, src_scc, v4 ; encoding: [0xc1,0xa0,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0xa0,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_min_i32 v7, src_scc, v4 ; encoding: [0xc1,0x80,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x80,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_min_num_f32 v7, src_scc, v4 ; encoding: [0xc1,0xb0,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0xb0,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v4 ; encoding: [0xc1,0x70,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x70,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_mul_f32 v7, src_scc, v4 ; encoding: [0xc1,0x30,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x30,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_sub_f32 v7, src_scc, v4 ; encoding: [0xc1,0x50,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x50,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_sub_nc_u32 
v7, src_scc, v4 ; encoding: [0xc1,0x40,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x40,0x21,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, -1 :: v_dual_subrev_f32 v7, src_scc, v4 ; encoding: [0xc1,0x60,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07] +0xc1,0x60,0x20,0xcf,0xfd,0x00,0x00,0x00,0xff,0x04,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_add_f32 v7, 0.5, v3 ; encoding: [0xf0,0x40,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x40,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_add_nc_u32 v7, 0.5, v3 ; encoding: [0xf0,0x00,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x00,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_ashrrev_i32 v7, 0.5, v3 ; encoding: [0xf0,0x60,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x60,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_cndmask_b32 v7, 0.5, v3, vcc_lo ; encoding: [0xf0,0x90,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x6a,0x07] +0xf0,0x90,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_fmac_f32 v7, 0.5, v3 ; encoding: [0xf0,0x00,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x00,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_lshlrev_b32 v7, 0.5, v3 ; encoding: [0xf0,0x10,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x10,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_lshrrev_b32 v7, 0.5, v3 ; encoding: [0xf0,0x50,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x50,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_max_i32 v7, 0.5, v3 ; encoding: [0xf0,0x70,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] 
+0xf0,0x70,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_max_num_f32 v7, 0.5, v3 ; encoding: [0xf0,0xa0,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0xa0,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_min_i32 v7, 0.5, v3 ; encoding: [0xf0,0x80,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x80,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_min_num_f32 v7, 0.5, v3 ; encoding: [0xf0,0xb0,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0xb0,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v3 ; encoding: [0xf0,0x70,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x70,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_mul_f32 v7, 0.5, v3 ; encoding: [0xf0,0x30,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x30,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_f32 v7, 0.5, v3 ; encoding: [0xf0,0x50,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x50,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_sub_nc_u32 v7, 0.5, v3 ; encoding: [0xf0,0x40,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x40,0x21,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, 0.5 :: v_dual_subrev_f32 v7, 0.5, v3 ; encoding: [0xf0,0x60,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07] +0xf0,0x60,0x20,0xcf,0xf0,0x00,0x00,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_add_f32 v7, vcc_hi, v255 ; 
encoding: [0x7f,0x40,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x40,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_add_nc_u32 v7, vcc_hi, v255 ; encoding: [0x7f,0x00,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x00,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_ashrrev_i32 v7, vcc_hi, v255 ; encoding: [0x7f,0x60,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x60,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_cndmask_b32 v7, exec_hi, v255, vcc_lo ; encoding: [0x7f,0x90,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x7f,0x90,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_fmac_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x00,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x00,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_lshlrev_b32 v7, vcc_hi, v255 ; encoding: [0x7f,0x10,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x10,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_lshrrev_b32 v7, vcc_hi, v255 ; encoding: [0x7f,0x50,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x50,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_max_i32 v7, vcc_hi, v255 ; encoding: [0x7f,0x70,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x70,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_max_num_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0xa0,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0xa0,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_min_i32 v7, vcc_hi, v255 ; encoding: 
[0x7f,0x80,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x80,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_min_num_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0xb0,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0xb0,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x70,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x70,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_mul_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x30,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x30,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x50,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x50,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_sub_nc_u32 v7, vcc_hi, v255 ; encoding: [0x7f,0x40,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x40,0x21,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_hi :: v_dual_subrev_f32 v7, vcc_hi, v255 ; encoding: [0x7f,0x60,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7f,0x60,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_add_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x40,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x40,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_add_nc_u32 v7, ttmp15, v255 ; encoding: [0x7e,0x00,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x7e,0x00,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_ashrrev_i32 v7, ttmp15, v255 ; encoding: [0x7e,0x60,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x60,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_cndmask_b32 v7, exec_lo, v255, vcc_lo ; encoding: [0x7e,0x90,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x7e,0x90,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_fmac_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x00,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x00,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_lshlrev_b32 v7, ttmp15, v255 ; encoding: [0x7e,0x10,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x10,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_lshrrev_b32 v7, ttmp15, v255 ; encoding: [0x7e,0x50,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x50,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_max_i32 v7, ttmp15, v255 ; encoding: [0x7e,0x70,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x70,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_max_num_f32 v7, ttmp15, v255 ; encoding: [0x7e,0xa0,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0xa0,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_min_i32 v7, ttmp15, v255 ; encoding: [0x7e,0x80,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x80,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_min_num_f32 v7, ttmp15, v255 ; encoding: [0x7e,0xb0,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x7e,0xb0,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x70,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x70,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_mul_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x30,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x30,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x50,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x50,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_sub_nc_u32 v7, ttmp15, v255 ; encoding: [0x7e,0x40,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x40,0x21,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, exec_lo :: v_dual_subrev_f32 v7, ttmp15, v255 ; encoding: [0x7e,0x60,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7e,0x60,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_add_f32 v7, m0, v255 ; encoding: [0x7d,0x40,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x40,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_add_nc_u32 v7, m0, v255 ; encoding: [0x7d,0x00,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x00,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_ashrrev_i32 v7, m0, v255 ; encoding: [0x7d,0x60,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x60,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 
v255, m0 :: v_dual_cndmask_b32 v7, m0, v255, vcc_lo ; encoding: [0x7d,0x90,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x7d,0x90,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_fmac_f32 v7, m0, v255 ; encoding: [0x7d,0x00,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x00,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_lshlrev_b32 v7, m0, v255 ; encoding: [0x7d,0x10,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x10,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_lshrrev_b32 v7, m0, v255 ; encoding: [0x7d,0x50,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x50,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_max_i32 v7, m0, v255 ; encoding: [0x7d,0x70,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x70,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_max_num_f32 v7, m0, v255 ; encoding: [0x7d,0xa0,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0xa0,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_min_i32 v7, m0, v255 ; encoding: [0x7d,0x80,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x80,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_min_num_f32 v7, m0, v255 ; encoding: [0x7d,0xb0,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0xb0,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mul_dx9_zero_f32 v7, m0, v255 ; encoding: [0x7d,0x70,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x7d,0x70,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_mul_f32 v7, m0, v255 ; encoding: [0x7d,0x30,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x30,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_sub_f32 v7, m0, v255 ; encoding: [0x7d,0x50,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x50,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_sub_nc_u32 v7, m0, v255 ; encoding: [0x7d,0x40,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x40,0x21,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, m0 :: v_dual_subrev_f32 v7, m0, v255 ; encoding: [0x7d,0x60,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7d,0x60,0x20,0xcf,0x7d,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_add_f32 v7, s105, v255 ; encoding: [0x01,0x40,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x40,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_add_nc_u32 v7, s105, v255 ; encoding: [0x01,0x00,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x00,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_ashrrev_i32 v7, s105, v255 ; encoding: [0x01,0x60,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x60,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_cndmask_b32 v7, s1, v255, vcc_lo ; encoding: [0x01,0x90,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x01,0x90,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_fmac_f32 v7, s105, v255 ; encoding: [0x01,0x00,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x00,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_lshlrev_b32 v7, 
s105, v255 ; encoding: [0x01,0x10,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x10,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_lshrrev_b32 v7, s105, v255 ; encoding: [0x01,0x50,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x50,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_max_i32 v7, s105, v255 ; encoding: [0x01,0x70,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x70,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_max_num_f32 v7, s105, v255 ; encoding: [0x01,0xa0,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0xa0,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_min_i32 v7, s105, v255 ; encoding: [0x01,0x80,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x80,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_min_num_f32 v7, s105, v255 ; encoding: [0x01,0xb0,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0xb0,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mul_dx9_zero_f32 v7, s105, v255 ; encoding: [0x01,0x70,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x70,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_mul_f32 v7, s105, v255 ; encoding: [0x01,0x30,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x30,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_sub_f32 v7, s105, v255 ; encoding: [0x01,0x50,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x01,0x50,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_sub_nc_u32 v7, s105, v255 ; encoding: [0x01,0x40,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x40,0x21,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s1 :: v_dual_subrev_f32 v7, s105, v255 ; encoding: [0x01,0x60,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x60,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_add_f32 v7, s1, v255 ; encoding: [0x69,0x40,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x40,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_add_nc_u32 v7, s1, v255 ; encoding: [0x69,0x00,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x00,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_ashrrev_i32 v7, s1, v255 ; encoding: [0x69,0x60,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x60,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_cndmask_b32 v7, s105, v255, vcc_lo ; encoding: [0x69,0x90,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x69,0x90,0x20,0xcf,0x69,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_fmac_f32 v7, s1, v255 ; encoding: [0x69,0x00,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x00,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_lshlrev_b32 v7, s1, v255 ; encoding: [0x69,0x10,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x10,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_lshrrev_b32 v7, s1, v255 ; encoding: [0x69,0x50,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x50,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: 
v_dual_max_i32 v7, s1, v255 ; encoding: [0x69,0x70,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x70,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_max_num_f32 v7, s1, v255 ; encoding: [0x69,0xa0,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0xa0,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_min_i32 v7, s1, v255 ; encoding: [0x69,0x80,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x80,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_min_num_f32 v7, s1, v255 ; encoding: [0x69,0xb0,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0xb0,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mul_dx9_zero_f32 v7, s1, v255 ; encoding: [0x69,0x70,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x70,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_mul_f32 v7, s1, v255 ; encoding: [0x69,0x30,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x30,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_sub_f32 v7, s1, v255 ; encoding: [0x69,0x50,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x50,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_sub_nc_u32 v7, s1, v255 ; encoding: [0x69,0x40,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x69,0x40,0x21,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, s105 :: v_dual_subrev_f32 v7, s1, v255 ; encoding: [0x69,0x60,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x69,0x60,0x20,0xcf,0x01,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_add_f32 v7, -1, v255 ; encoding: [0xfd,0x40,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x40,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_add_nc_u32 v7, -1, v255 ; encoding: [0xfd,0x00,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x00,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_ashrrev_i32 v7, -1, v255 ; encoding: [0xfd,0x60,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x60,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_cndmask_b32 v7, -1, v255, vcc_lo ; encoding: [0xfd,0x90,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0xfd,0x90,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_fmac_f32 v7, -1, v255 ; encoding: [0xfd,0x00,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x00,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_lshlrev_b32 v7, -1, v255 ; encoding: [0xfd,0x10,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x10,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_lshrrev_b32 v7, -1, v255 ; encoding: [0xfd,0x50,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x50,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_max_i32 v7, -1, v255 ; encoding: [0xfd,0x70,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x70,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_max_num_f32 v7, -1, v255 ; encoding: [0xfd,0xa0,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0xa0,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: 
v_dual_mov_b32 v255, src_scc :: v_dual_min_i32 v7, -1, v255 ; encoding: [0xfd,0x80,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x80,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_min_num_f32 v7, -1, v255 ; encoding: [0xfd,0xb0,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0xb0,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mul_dx9_zero_f32 v7, -1, v255 ; encoding: [0xfd,0x70,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x70,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_mul_f32 v7, -1, v255 ; encoding: [0xfd,0x30,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x30,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_sub_f32 v7, -1, v255 ; encoding: [0xfd,0x50,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x50,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_sub_nc_u32 v7, -1, v255 ; encoding: [0xfd,0x40,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x40,0x21,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, src_scc :: v_dual_subrev_f32 v7, -1, v255 ; encoding: [0xfd,0x60,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0xfd,0x60,0x20,0xcf,0xc1,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x40,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x40,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_add_nc_u32 v7, vcc_lo, v255 ; encoding: 
[0x7b,0x00,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x00,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_ashrrev_i32 v7, vcc_lo, v255 ; encoding: [0x7b,0x60,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x60,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_cndmask_b32 v7, ttmp15, v255, vcc_lo ; encoding: [0x7b,0x90,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x7b,0x90,0x20,0xcf,0x7b,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_fmac_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x00,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x00,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshlrev_b32 v7, vcc_lo, v255 ; encoding: [0x7b,0x10,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x10,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_lshrrev_b32 v7, vcc_lo, v255 ; encoding: [0x7b,0x50,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x50,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_i32 v7, vcc_lo, v255 ; encoding: [0x7b,0x70,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x70,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_max_num_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0xa0,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0xa0,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_i32 v7, vcc_lo, v255 ; encoding: [0x7b,0x80,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x80,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_min_num_f32 v7, vcc_lo, v255 ; encoding: 
[0x7b,0xb0,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0xb0,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x70,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x70,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_mul_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x30,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x30,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x50,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x50,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_sub_nc_u32 v7, vcc_lo, v255 ; encoding: [0x7b,0x40,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x40,0x21,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, ttmp15 :: v_dual_subrev_f32 v7, vcc_lo, v255 ; encoding: [0x7b,0x60,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x7b,0x60,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_add_f32 v7, v255, v255 ; encoding: [0x01,0x41,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x41,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_add_nc_u32 v7, v255, v255 ; encoding: [0x01,0x01,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x01,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_ashrrev_i32 v7, v255, v255 ; encoding: [0x01,0x61,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x01,0x61,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_cndmask_b32 v7, v255, v255, vcc_lo ; encoding: [0x01,0x91,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +0x01,0x91,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_fmac_f32 v7, v255, v255 ; encoding: [0x01,0x01,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x01,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_lshlrev_b32 v7, v255, v255 ; encoding: [0x01,0x11,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x11,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_lshrrev_b32 v7, v255, v255 ; encoding: [0x01,0x51,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x51,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_max_i32 v7, v255, v255 ; encoding: [0x01,0x71,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x71,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_max_num_f32 v7, v255, v255 ; encoding: [0x01,0xa1,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0xa1,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_min_i32 v7, v255, v255 ; encoding: [0x01,0x81,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x81,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_min_num_f32 v7, v255, v255 ; encoding: [0x01,0xb1,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0xb1,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: 
v_dual_mul_dx9_zero_f32 v7, v255, v255 ; encoding: [0x01,0x71,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x71,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_mul_f32 v7, v255, v255 ; encoding: [0x01,0x31,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x31,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_sub_f32 v7, v255, v255 ; encoding: [0x01,0x51,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x51,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_sub_nc_u32 v7, v255, v255 ; encoding: [0x01,0x41,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x41,0x21,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v1 :: v_dual_subrev_f32 v7, v255, v255 ; encoding: [0x01,0x61,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x01,0x61,0x20,0xcf,0xff,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_add_f32 v7, v3, v255 ; encoding: [0x02,0x41,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x41,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_add_nc_u32 v7, v3, v255 ; encoding: [0x02,0x01,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x01,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_ashrrev_i32 v7, v3, v255 ; encoding: [0x02,0x61,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x61,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_cndmask_b32 v7, v3, v255, vcc_lo ; encoding: [0x02,0x91,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +0x02,0x91,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_fmac_f32 v7, v3, v255 ; encoding: [0x02,0x01,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x02,0x01,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_lshlrev_b32 v7, v3, v255 ; encoding: [0x02,0x11,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x11,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_lshrrev_b32 v7, v3, v255 ; encoding: [0x02,0x51,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x51,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_max_i32 v7, v3, v255 ; encoding: [0x02,0x71,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x71,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_max_num_f32 v7, v3, v255 ; encoding: [0x02,0xa1,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0xa1,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_min_i32 v7, v3, v255 ; encoding: [0x02,0x81,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x81,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_min_num_f32 v7, v3, v255 ; encoding: [0x02,0xb1,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0xb1,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v255 ; encoding: [0x02,0x71,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x71,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_mul_f32 v7, v3, v255 ; encoding: [0x02,0x31,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x31,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_sub_f32 v7, v3, v255 ; encoding: 
[0x02,0x51,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x51,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_sub_nc_u32 v7, v3, v255 ; encoding: [0x02,0x41,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x41,0x21,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v2 :: v_dual_subrev_f32 v7, v3, v255 ; encoding: [0x02,0x61,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x02,0x61,0x20,0xcf,0x03,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_add_f32 v7, v2, v255 ; encoding: [0xff,0x41,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x41,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_add_nc_u32 v7, v2, v255 ; encoding: [0xff,0x01,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x01,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_ashrrev_i32 v7, v2, v255 ; encoding: [0xff,0x61,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x61,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_cndmask_b32 v7, v2, v255, vcc_lo ; encoding: [0xff,0x91,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +0xff,0x91,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_fmac_f32 v7, v2, v255 ; encoding: [0xff,0x01,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x01,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_lshlrev_b32 v7, v2, v255 ; encoding: [0xff,0x11,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x11,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_lshrrev_b32 v7, v2, v255 ; encoding: [0xff,0x51,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] 
+0xff,0x51,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_max_i32 v7, v2, v255 ; encoding: [0xff,0x71,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x71,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_max_num_f32 v7, v2, v255 ; encoding: [0xff,0xa1,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0xa1,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_min_i32 v7, v2, v255 ; encoding: [0xff,0x81,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x81,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_min_num_f32 v7, v2, v255 ; encoding: [0xff,0xb1,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0xb1,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mul_dx9_zero_f32 v7, v2, v255 ; encoding: [0xff,0x71,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x71,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_mul_f32 v7, v2, v255 ; encoding: [0xff,0x31,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x31,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_sub_f32 v7, v2, v255 ; encoding: [0xff,0x51,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x51,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_sub_nc_u32 v7, v2, v255 ; encoding: [0xff,0x41,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x41,0x21,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v255 :: v_dual_subrev_f32 v7, v2, 
v255 ; encoding: [0xff,0x61,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0xff,0x61,0x20,0xcf,0x02,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_add_f32 v7, v4, v255 ; encoding: [0x03,0x41,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x41,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_add_nc_u32 v7, v4, v255 ; encoding: [0x03,0x01,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x01,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_ashrrev_i32 v7, v4, v255 ; encoding: [0x03,0x61,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x61,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_cndmask_b32 v7, v4, v255, vcc_lo ; encoding: [0x03,0x91,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +0x03,0x91,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_fmac_f32 v7, v4, v255 ; encoding: [0x03,0x01,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x01,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_lshlrev_b32 v7, v4, v255 ; encoding: [0x03,0x11,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x11,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_lshrrev_b32 v7, v4, v255 ; encoding: [0x03,0x51,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x51,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_max_i32 v7, v4, v255 ; encoding: [0x03,0x71,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x71,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_max_num_f32 v7, v4, v255 ; encoding: [0x03,0xa1,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x03,0xa1,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_min_i32 v7, v4, v255 ; encoding: [0x03,0x81,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x81,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_min_num_f32 v7, v4, v255 ; encoding: [0x03,0xb1,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0xb1,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mul_dx9_zero_f32 v7, v4, v255 ; encoding: [0x03,0x71,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x71,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_mul_f32 v7, v4, v255 ; encoding: [0x03,0x31,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x31,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_sub_f32 v7, v4, v255 ; encoding: [0x03,0x51,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x51,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_sub_nc_u32 v7, v4, v255 ; encoding: [0x03,0x41,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x41,0x21,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v3 :: v_dual_subrev_f32 v7, v4, v255 ; encoding: [0x03,0x61,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x03,0x61,0x20,0xcf,0x04,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_add_f32 v7, v1, v255 ; encoding: [0x04,0x41,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x41,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_add_nc_u32 v7, v1, v255 ; encoding: 
[0x04,0x01,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x01,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_ashrrev_i32 v7, v1, v255 ; encoding: [0x04,0x61,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x61,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0xfe ; encoding: [0x04,0x21,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0x03,0xfe,0x07] +0x04,0x21,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0x03,0xfe,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v7, v1, v255, s96 ; encoding: [0x04,0x91,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x60,0x07] +0x04,0x91,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x60,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_cndmask_b32 v7, v1, v255, vcc_lo ; encoding: [0x04,0x91,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x6a,0x07] +0x04,0x91,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_fmac_f32 v7, v1, v255 ; encoding: [0x04,0x01,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x01,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_lshlrev_b32 v7, v1, v255 ; encoding: [0x04,0x11,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x11,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_lshrrev_b32 v7, v1, v255 ; encoding: [0x04,0x51,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x51,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_max_i32 v7, v1, v255 ; encoding: [0x04,0x71,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x04,0x71,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_max_num_f32 v7, v1, v255 ; encoding: [0x04,0xa1,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0xa1,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_min_i32 v7, v1, v255 ; encoding: [0x04,0x81,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x81,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_min_num_f32 v7, v1, v255 ; encoding: [0x04,0xb1,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0xb1,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mul_dx9_zero_f32 v7, v1, v255 ; encoding: [0x04,0x71,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x71,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_mul_f32 v7, v1, v255 ; encoding: [0x04,0x31,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x31,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_sub_f32 v7, v1, v255 ; encoding: [0x04,0x51,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x51,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_sub_nc_u32 v7, v1, v255 ; encoding: [0x04,0x41,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x41,0x21,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, v4 :: v_dual_subrev_f32 v7, v1, v255 ; encoding: [0x04,0x61,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07] +0x04,0x61,0x20,0xcf,0x01,0x01,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_f32 v7, exec_lo, v255 ; 
encoding: [0x6b,0x40,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x40,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_add_nc_u32 v7, exec_lo, v255 ; encoding: [0x6b,0x00,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x00,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_ashrrev_i32 v7, exec_lo, v255 ; encoding: [0x6b,0x60,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x60,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_cndmask_b32 v7, vcc_hi, v255, vcc_lo ; encoding: [0x6b,0x90,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x6b,0x90,0x20,0xcf,0x6b,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_fmac_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x00,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x00,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshlrev_b32 v7, exec_lo, v255 ; encoding: [0x6b,0x10,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x10,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_lshrrev_b32 v7, exec_lo, v255 ; encoding: [0x6b,0x50,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x50,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_i32 v7, exec_lo, v255 ; encoding: [0x6b,0x70,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x70,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_max_num_f32 v7, exec_lo, v255 ; encoding: [0x6b,0xa0,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0xa0,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_i32 v7, exec_lo, v255 ; encoding: 
[0x6b,0x80,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x80,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_min_num_f32 v7, exec_lo, v255 ; encoding: [0x6b,0xb0,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0xb0,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x70,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x70,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_mul_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x30,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x30,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x50,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x50,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_sub_nc_u32 v7, exec_lo, v255 ; encoding: [0x6b,0x40,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x40,0x21,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_hi :: v_dual_subrev_f32 v7, exec_lo, v255 ; encoding: [0x6b,0x60,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6b,0x60,0x20,0xcf,0x7e,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x40,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x40,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_add_nc_u32 v7, exec_hi, v255 ; encoding: [0x6a,0x00,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x6a,0x00,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_ashrrev_i32 v7, exec_hi, v255 ; encoding: [0x6a,0x60,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x60,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_cndmask_b32 v7, vcc_lo, v255, vcc_lo ; encoding: [0x6a,0x90,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x6a,0x07] +0x6a,0x90,0x20,0xcf,0x6a,0x00,0x00,0x00,0xff,0xff,0x6a,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_fmac_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x00,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x00,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshlrev_b32 v7, exec_hi, v255 ; encoding: [0x6a,0x10,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x10,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_lshrrev_b32 v7, exec_hi, v255 ; encoding: [0x6a,0x50,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x50,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_i32 v7, exec_hi, v255 ; encoding: [0x6a,0x70,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x70,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_max_num_f32 v7, exec_hi, v255 ; encoding: [0x6a,0xa0,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0xa0,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_i32 v7, exec_hi, v255 ; encoding: [0x6a,0x80,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x80,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_min_num_f32 v7, exec_hi, v255 ; encoding: [0x6a,0xb0,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] 
+0x6a,0xb0,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x70,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x70,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_mul_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x30,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x30,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x50,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x50,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_sub_nc_u32 v7, exec_hi, v255 ; encoding: [0x6a,0x40,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x40,0x21,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v255, vcc_lo :: v_dual_subrev_f32 v7, exec_hi, v255 ; encoding: [0x6a,0x60,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07] +0x6a,0x60,0x20,0xcf,0x7f,0x00,0x00,0x00,0xff,0xff,0x00,0x07 + +# GFX1250: v_dual_mov_b32 v25, v8 :: v_dual_mov_b32 v13, v16 ; encoding: [0x08,0x81,0x20,0xcf,0x10,0x01,0x00,0x00,0x19,0x00,0x00,0x0d] +0x08,0x81,0x20,0xcf,0x10,0x01,0x00,0x00,0x19,0x00,0x00,0x0d + +# GFX1250: v_dual_mul_dx9_zero_f32 v0, -v1, v2 :: v_dual_fma_f32 v5, -s6, -v7, -v8 ; encoding: [0x01,0x31,0x1d,0xcf,0x06,0x72,0x02,0x00,0x00,0x07,0x08,0x05] +0x01,0x31,0x1d,0xcf,0x06,0x72,0x02,0x00,0x00,0x07,0x08,0x05 + +# GFX1250: v_dual_mul_dx9_zero_f32 v0, -v1, v2 :: v_dual_lshlrev_b32 v5, v6, v7 ; encoding: [0x01,0x11,0x1d,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x11,0x1d,0xcf,0x06,0x03,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v0, v1, -v2 :: v_dual_bitop2_b32 v5, v6, v7 bitop3:1 ; encoding: [0x01,0x21,0x1d,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x01,0x05] +0x01,0x21,0x1d,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x01,0x05 + +# GFX1250: v_dual_mul_dx9_zero_f32 v0, v1, -v2 :: v_dual_mul_f32 v5, v6, -v7 ; encoding: [0x01,0x31,0x1c,0xcf,0x06,0x25,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x31,0x1c,0xcf,0x06,0x25,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# 
GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x1d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# 
GFX1250: v_dual_mul_dx9_zero_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x1c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x1d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x1c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 
v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0xa0,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x1d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x1c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x1c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x40,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 
exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x1d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x1c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x1c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 
m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x1d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x1c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x1c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x1c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, 
v3 ; encoding: [0x01,0x60,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; 
encoding: [0x01,0x70,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x1d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x1c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x1c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: 
[0x69,0x90,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x1c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: 
[0x69,0x30,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x1d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x1c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x1c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x1c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: 
[0xfd,0x00,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; 
encoding: [0xfd,0x50,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x1d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x1c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x1c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x1c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x1c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: 
v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x1d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x1c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x1c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x1d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x1c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x1c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x1c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, 
v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x1d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x1c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; 
encoding: [0x02,0x81,0x1c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x1c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: 
[0xff,0xa1,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x1d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x1c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x1c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x1c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: 
[0x03,0x41,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: 
[0x03,0x81,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x1d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x1c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x1c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x1c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: 
[0x04,0x01,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x11 ; encoding: [0x04,0x21,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x11,0x07] +0x04,0x21,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x11,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: 
[0x04,0x71,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x1d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x1c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: 
[0x04,0x81,0x1c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x1c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x1c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: 
v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x1d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x1c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x1c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x1c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: 
v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x1c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0xa0,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x1d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x1c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_dx9_zero_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x1c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x1c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v0, -v1, -v2 :: v_dual_bitop2_b32 v5, v6, v7 bitop3:0x64 ; encoding: 
[0x01,0x21,0x0d,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x64,0x05] +0x01,0x21,0x0d,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x64,0x05 + +# GFX1250: v_dual_mul_f32 v0, -v1, -v2 :: v_dual_lshrrev_b32 v5, v6, v7 ; encoding: [0x01,0x51,0x0d,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x51,0x0d,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mul_f32 v0, v1, v2 :: v_dual_fmac_f32 v5, -v6, -v7 ; encoding: [0x01,0x01,0x0c,0xcf,0x06,0x31,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x01,0x0c,0xcf,0x06,0x31,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mul_f32 v0, v1, v2 :: v_dual_mul_dx9_zero_f32 v5, -v6, -v7 ; encoding: [0x01,0x71,0x0c,0xcf,0x06,0x31,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x71,0x0c,0xcf,0x06,0x31,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] 
+0xc1,0x10,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: 
v_dual_mul_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x0d,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x0c,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: 
[0xf0,0x70,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x0d,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x60,0x0c,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7f,0xa0,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x0d,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x0c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x0c,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x40,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7e,0x80,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x0d,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x0c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x0c,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + 
+# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: 
[0x7d,0x70,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x0d,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x0c,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x0c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x0c,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x01,0x90,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, 
v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x0d,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x0c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x0c,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x0c,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: 
[0x69,0x10,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x69,0x40,0x0d,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x0c,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x0c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x0c,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x0d,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x0c,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: 
[0xfd,0x80,0x0c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x0c,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x0c,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0xa0,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x0d,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x0c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x0c,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x41,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, 
v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x0d,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x0c,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x0c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x0c,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: 
[0x02,0x61,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x02,0x71,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x0d,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x0c,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x0c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x0c,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: 
v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xff,0x51,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x0d,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x0c,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x0c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x0c,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: 
v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x0d,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x03,0x61,0x0c,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x0c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x0c,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x71 ; encoding: [0x04,0x21,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x71,0x07] +0x04,0x21,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x71,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, 
v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x41,0x0d,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x0c,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x0c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x0c,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x0c,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x50,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x0d,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x60,0x0c,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x0c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x0c,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x0c,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0x70,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x0d,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x0c,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x0c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x6a,0x80,0x0c,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[0:1], -v[8:9], v[4:5] :: v_dual_add_f32 v5, -v6, v7 ; encoding: [0x08,0x41,0x88,0xcf,0x06,0x13,0x04,0x00,0x00,0x07,0x00,0x05] +0x08,0x41,0x88,0xcf,0x06,0x13,0x04,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mul_f64 v[0:1], v[8:9], -v[4:5] :: v_dual_lshlrev_b32 v5, v6, v7 ; encoding: [0x08,0x11,0x89,0xcf,0x06,0x05,0x04,0x00,0x00,0x07,0x00,0x05] +0x08,0x11,0x89,0xcf,0x06,0x05,0x04,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_mul_f64 v[252:253], v[6:7], v[4:5] :: v_dual_add_f32 v8, v1, v3 ; encoding: [0x06,0x41,0x88,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08] +0x06,0x41,0x88,0xcf,0x01,0x01,0x04,0x00,0xfc,0x03,0x00,0x08 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x40,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x89,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x00,0x89,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07] +0xc1,0x90,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_fmac_f32 v9, src_scc, v5 ; encoding: [0xc1,0x00,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09] +0xc1,0x00,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x89,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x10,0x89,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: 
[0xc1,0xa0,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xa0,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0xb0,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07] +0xc1,0x80,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x70,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x30,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x50,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], -1, v[6:7] :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07] +0xc1,0x60,0x88,0xcf,0xfd,0x00,0x06,0x00,0xfe,0x05,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x40,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x89,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x00,0x89,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; 
encoding: [0xf0,0x90,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07] +0xf0,0x90,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_fmac_f32 v9, 0.5, v2 ; encoding: [0xf0,0x00,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09] +0xf0,0x00,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x89,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x10,0x89,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xa0,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0xb0,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07] +0xf0,0x80,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x70,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x30,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x50,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], 0.5, v[8:9] :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: 
[0xf0,0x60,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07] +0xf0,0x60,0x88,0xcf,0xf0,0x00,0x08,0x00,0xfe,0x02,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[254:255] :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x88,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7e,0x80,0x88,0xcf,0x7b,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_add_f32 v7, v15, v3 ; encoding: [0x7e,0x40,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x40,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_add_nc_u32 v7, v15, v3 ; encoding: [0x7e,0x00,0x89,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x00,0x89,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_cndmask_b32 v7, v10, v3, vcc_lo ; encoding: [0x7e,0x90,0x88,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7e,0x90,0x88,0xcf,0x0a,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_fmac_f32 v9, v15, v3 ; encoding: [0x7e,0x00,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7e,0x00,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_lshlrev_b32 v7, v15, v3 ; encoding: [0x7e,0x10,0x89,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x10,0x89,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_max_num_f32 v7, v15, v3 ; encoding: [0x7e,0xa0,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xa0,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_min_num_f32 v7, v15, v3 ; encoding: [0x7e,0xb0,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0xb0,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v15, v3 
; encoding: [0x7e,0x70,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x70,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_mul_f32 v7, v15, v3 ; encoding: [0x7e,0x30,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x30,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_sub_f32 v7, v15, v3 ; encoding: [0x7e,0x50,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x50,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], exec, v[4:5] :: v_dual_subrev_f32 v7, v15, v3 ; encoding: [0x7e,0x60,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7e,0x60,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[254:255] :: v_dual_mov_b32 v7, s1 ; encoding: [0x68,0x80,0x88,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x68,0x80,0x88,0xcf,0x01,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x68,0x40,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x40,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x68,0x00,0x89,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x00,0x89,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_fmac_f32 v9, v1, v3 ; encoding: [0x68,0x00,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x68,0x00,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x68,0x10,0x89,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x10,0x89,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: 
v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x68,0xa0,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xa0,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x68,0xb0,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0xb0,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x68,0x70,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x70,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x68,0x30,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x30,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x68,0x50,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x50,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[104:105], v[4:5] :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x68,0x60,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x68,0x60,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[2:3], v[254:255] :: v_dual_mov_b32 v7, s105 ; encoding: [0x02,0x80,0x88,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x80,0x88,0xcf,0x69,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], s[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x02,0x90,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x90,0x88,0xcf,0x01,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[254:255] :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x88,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfd,0x80,0x88,0xcf,0xc1,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: 
v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x40,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x89,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x00,0x89,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfd,0x90,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_fmac_f32 v9, -1, v3 ; encoding: [0xfd,0x00,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfd,0x00,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x89,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x10,0x89,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xa0,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0xb0,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x70,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0xfd,0x30,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x50,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], src_scc, v[4:5] :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfd,0x60,0x88,0xcf,0xc1,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[254:255] :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7a,0x80,0x88,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x7a,0x80,0x88,0xcf,0x6a,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x40,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x40,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7a,0x00,0x89,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x00,0x89,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_cndmask_b32 v7, v15, v3, vcc_lo ; encoding: [0x7a,0x90,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x7a,0x90,0x88,0xcf,0x0f,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_fmac_f32 v9, vcc_lo, v3 ; encoding: [0x7a,0x00,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x7a,0x00,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7a,0x10,0x89,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x10,0x89,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_max_num_f32 v7, vcc_lo, v3 ; 
encoding: [0x7a,0xa0,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xa0,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0xb0,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0xb0,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x70,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x70,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x30,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x30,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x50,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x50,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], ttmp[14:15], v[4:5] :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7a,0x60,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x7a,0x60,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0xfe,0x81,0x88,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0xfe,0x81,0x88,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_f32 v7, v5, v3 ; encoding: [0xfe,0x41,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x41,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_add_nc_u32 v7, v5, v3 ; encoding: [0xfe,0x01,0x89,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x01,0x89,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 
v[254:255], v[254:255], v[4:5] :: v_dual_cndmask_b32 v7, v5, v3, vcc_lo ; encoding: [0xfe,0x91,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0xfe,0x91,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_fmac_f32 v9, v5, v3 ; encoding: [0xfe,0x01,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0xfe,0x01,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_lshlrev_b32 v7, v5, v3 ; encoding: [0xfe,0x11,0x89,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x11,0x89,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_max_num_f32 v7, v5, v3 ; encoding: [0xfe,0xa1,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xa1,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_min_num_f32 v7, v5, v3 ; encoding: [0xfe,0xb1,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0xb1,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v5, v3 ; encoding: [0xfe,0x71,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x71,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_mul_f32 v7, v5, v3 ; encoding: [0xfe,0x31,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x31,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_sub_f32 v7, v5, v3 ; encoding: [0xfe,0x51,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0xfe,0x51,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[254:255], v[4:5] :: v_dual_subrev_f32 v7, v5, v3 ; encoding: [0xfe,0x61,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] 
+0xfe,0x61,0x88,0xcf,0x05,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[254:255] :: v_dual_mov_b32 v7, v253 ; encoding: [0x02,0x81,0x88,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x02,0x81,0x88,0xcf,0xfd,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_f32 v7, v253, v3 ; encoding: [0x02,0x41,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x41,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_add_nc_u32 v7, v253, v3 ; encoding: [0x02,0x01,0x89,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x01,0x89,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_cndmask_b32 v7, v253, v3, vcc_lo ; encoding: [0x02,0x91,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x02,0x91,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_fmac_f32 v9, v253, v3 ; encoding: [0x02,0x01,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x02,0x01,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_lshlrev_b32 v7, v253, v3 ; encoding: [0x02,0x11,0x89,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x11,0x89,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_max_num_f32 v7, v253, v3 ; encoding: [0x02,0xa1,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xa1,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_min_num_f32 v7, v253, v3 ; encoding: [0x02,0xb1,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0xb1,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v253, v3 ; encoding: 
[0x02,0x71,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x71,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_mul_f32 v7, v253, v3 ; encoding: [0x02,0x31,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x31,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_sub_f32 v7, v253, v3 ; encoding: [0x02,0x51,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x51,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[2:3], v[4:5] :: v_dual_subrev_f32 v7, v253, v3 ; encoding: [0x02,0x61,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x02,0x61,0x88,0xcf,0xfd,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[254:255] :: v_dual_mov_b32 v7, v3 ; encoding: [0x04,0x81,0x88,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x04,0x81,0x88,0xcf,0x03,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x04,0x41,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0x04,0x01,0x89,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x89,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x04,0x01,0x89,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x01,0x89,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, s96 ; encoding: [0x04,0x91,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07] +0x04,0x91,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x60,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v2, v3, 
vcc_lo ; encoding: [0x04,0x91,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x04,0x91,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x04,0x91,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v2, v3 ; encoding: [0x04,0x01,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_fmac_f32 v9, v3, v3 ; encoding: [0x04,0x01,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x04,0x01,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0x04,0x11,0x89,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x89,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x04,0x11,0x89,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x11,0x89,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0x04,0xa1,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x04,0xa1,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xa1,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0x04,0xb1,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: 
v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x04,0xb1,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0xb1,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0x04,0x71,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x04,0x71,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x71,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0x04,0x31,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x04,0x31,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x31,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0x04,0x51,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x04,0x51,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x51,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0x04,0x61,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x88,0xcf,0x02,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[4:5], v[4:5] :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x04,0x61,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x04,0x61,0x88,0xcf,0x03,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], 
v[254:255] :: v_dual_mov_b32 v7, v4 ; encoding: [0x06,0x81,0x88,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x06,0x81,0x88,0xcf,0x04,0x01,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x06,0x41,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x41,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x06,0x01,0x89,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x01,0x89,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x06,0x91,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x06,0x91,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_fmac_f32 v9, v4, v3 ; encoding: [0x06,0x01,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09] +0x06,0x01,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x06,0x11,0x89,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x11,0x89,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x06,0xa1,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xa1,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x06,0xb1,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0xb1,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x06,0x71,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x71,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 
v[254:255], v[6:7], v[4:5] :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x06,0x31,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x31,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x06,0x51,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x51,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], v[6:7], v[4:5] :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x06,0x61,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07] +0x06,0x61,0x88,0xcf,0x04,0x01,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[254:255] :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x88,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07] +0x6a,0x80,0x88,0xcf,0x7f,0x00,0xfe,0x00,0xfe,0x00,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x40,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x89,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x00,0x89,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07] +0x6a,0x90,0x88,0xcf,0x6a,0x00,0x04,0x00,0xfe,0x03,0x6a,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_fmac_f32 v9, exec_hi, v3 ; encoding: [0x6a,0x00,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09] +0x6a,0x00,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x89,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x10,0x89,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: 
v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xa0,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0xb0,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x70,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x30,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x50,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[254:255], vcc, v[4:5] :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07] +0x6a,0x60,0x88,0xcf,0x7f,0x00,0x04,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_ashrrev_i32 v9, v1, v14 ; encoding: [0x06,0x61,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x61,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x93 ; encoding: [0x06,0x21,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x93,0x07] +0x06,0x21,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x03,0x93,0x07 + +# GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_fma_f32 v9, v1, v14, v4 ; encoding: [0x06,0x31,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09] 
+0x06,0x31,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x04,0x09 + +# GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_lshrrev_b32 v9, v1, v14 ; encoding: [0x06,0x51,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x51,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_mul_f64 v[2:3], v[6:7], v[4:5] :: v_dual_sub_nc_u32 v9, v1, v14 ; encoding: [0x06,0x41,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09] +0x06,0x41,0x89,0xcf,0x01,0x01,0x04,0x00,0x02,0x0e,0x00,0x09 + +# GFX1250: v_dual_sub_f32 v0, -v1, -v2 :: v_dual_lshrrev_b32 v5, v6, v7 ; encoding: [0x01,0x51,0x15,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x51,0x15,0xcf,0x06,0x07,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_sub_f32 v0, -v1, v2 :: v_dual_mul_f32 v5, -v6, -v7 ; encoding: [0x01,0x31,0x14,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x31,0x14,0xcf,0x06,0x33,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_sub_f32 v0, v1, -v2 :: v_dual_max_i32 v5, v6, v7 ; encoding: [0x01,0x71,0x15,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x71,0x15,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_sub_f32 v0, v1, v2 :: v_dual_min_num_f32 v5, v6, -v7 ; encoding: [0x01,0xb1,0x14,0xcf,0x06,0x21,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0xb1,0x14,0xcf,0x06,0x21,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: 
v_dual_sub_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; 
encoding: [0xc1,0x70,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x15,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x14,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x00,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: 
v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x15,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x14,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x50,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x15,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: 
[0x7f,0x60,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x14,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x14,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: 
[0x7e,0x70,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x15,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: 
[0x7e,0x80,0x14,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x14,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_ashrrev_i32 v7, m0, v3 ; encoding: [0x7d,0x60,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7d,0xa0,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: [0x7d,0x70,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x15,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x14,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x14,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x14,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_add_nc_u32 
v7, s105, v3 ; encoding: [0x01,0x00,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0xb0,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x15,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x14,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x14,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: 
v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x14,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: 
[0x69,0x30,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x15,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x14,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x14,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x14,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x00,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_sub_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x15,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x14,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x14,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x14,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x14,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 
v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x15,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0x60,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x14,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x14,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x71,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x15,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x14,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x14,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x14,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: 
v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x02,0x81,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x15,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x14,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x14,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x14,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: 
v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: 
[0xff,0x71,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x15,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x14,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x14,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x14,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x03,0x91,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: 
v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x15,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x14,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x14,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x14,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x82 ; encoding: [0x04,0x21,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x82,0x07] +0x04,0x21,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x82,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] 
+0x04,0x31,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: 
v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x15,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x14,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x14,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x14,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x14,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x10,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x40,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x15,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x14,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x14,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x14,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x14,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: 
[0x6a,0x50,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x15,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: 
[0x6a,0x60,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x14,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x14,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x14,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v0, v1, v2 :: v_dual_mul_dx9_zero_f32 v5, v6, v7 ; encoding: [0x01,0x71,0x50,0xcf,0x06,0x01,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x71,0x50,0xcf,0x06,0x01,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x50,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07] +0x04,0x41,0x50,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_ashrrev_i32 v9, v1, v13 ; encoding: [0x04,0x61,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x61,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x86 ; encoding: [0x04,0x21,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x86,0x07] +0x04,0x21,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x86,0x07 + +# GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: [0x04,0x31,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07] +0x04,0x31,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x03,0x04,0x07 + +# GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_lshrrev_b32 v9, v1, v13 ; encoding: [0x04,0x51,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x51,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_sub_nc_u32 v254, v4, v2 :: v_dual_sub_nc_u32 v9, v1, v13 ; encoding: [0x04,0x41,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09] +0x04,0x41,0x51,0xcf,0x01,0x01,0x02,0x00,0xfe,0x0d,0x00,0x09 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] 
+0xc1,0x40,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x51,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x51,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x51,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x51,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] 
+0xc1,0x30,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x50,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x51,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x51,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x51,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x51,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + 
+# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x50,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x51,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x51,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 
:: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x51,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x51,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x50,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x50,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: 
v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x51,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x51,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x51,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x51,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v2 :: 
v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x50,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x50,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x51,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x51,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x51,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x51,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: 
[0x7d,0x70,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x50,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x50,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x50,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x51,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x51,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x90,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x51,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x01,0x10,0x51,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x50,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x50,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x51,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x51,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x50,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; encoding: [0x69,0x10,0x51,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x51,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x50,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; 
encoding: [0x69,0x80,0x50,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x50,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x51,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x51,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x51,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x51,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: 
[0xfd,0x30,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x50,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x50,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x50,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x51,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x51,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x50,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x51,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x51,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: 
[0x7b,0xa0,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x50,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7b,0x80,0x50,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x51,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x51,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: 
[0x01,0x91,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x51,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x51,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x50,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x50,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x01,0x81,0x50,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: [0x02,0x01,0x51,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x51,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x51,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x51,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xb1,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x50,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x50,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x50,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x51,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x51,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x51,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x51,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: 
[0xff,0xb1,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: [0xff,0x31,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x50,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x50,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x50,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x51,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x51,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x03,0x01,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x51,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x51,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x50,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x50,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x50,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x51,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x51,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 
v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x51,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x51,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x31,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: 
[0x04,0x61,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x50,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x50,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x50,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x51,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x51,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x50,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x51,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x51,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: 
[0x6b,0x70,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x50,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x50,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x50,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x51,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x51,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x50,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: 
[0x6a,0x10,0x51,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x10,0x51,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x50,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_sub_nc_u32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x50,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x50,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v0, -s1, -v2 :: v_dual_mul_f32 v5, -s6, -v7 ; encoding: [0x01,0x30,0x18,0xcf,0x06,0x36,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x30,0x18,0xcf,0x06,0x36,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_subrev_f32 v0, v1, -v2 :: v_dual_max_i32 v5, v6, v7 ; encoding: 
[0x01,0x71,0x19,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x71,0x19,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_subrev_f32 v0, v1, -v2 :: v_dual_min_i32 v5, v6, v7 ; encoding: [0x01,0x81,0x19,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x81,0x19,0xcf,0x06,0x05,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_subrev_f32 v0, v1, -v2 :: v_dual_sub_f32 v5, -v6, v7 ; encoding: [0x01,0x51,0x18,0xcf,0x06,0x15,0x02,0x00,0x00,0x07,0x00,0x05] +0x01,0x51,0x18,0xcf,0x06,0x15,0x02,0x00,0x00,0x07,0x00,0x05 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_f32 v7, src_scc, v5 ; encoding: [0xc1,0x40,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_add_nc_u32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_ashrrev_i32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_cndmask_b32 v7, src_scc, v5, vcc_lo ; encoding: [0xc1,0x90,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07] +0xc1,0x90,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_fmac_f32 v7, src_scc, v5 ; encoding: [0xc1,0x00,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x00,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshlrev_b32 v7, src_scc, v5 ; encoding: [0xc1,0x10,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x10,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_lshrrev_b32 v7, src_scc, v5 ; encoding: 
[0xc1,0x50,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_i32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_max_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xa0,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xa0,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_i32 v7, src_scc, v5 ; encoding: [0xc1,0x80,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x80,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_min_num_f32 v7, src_scc, v5 ; encoding: [0xc1,0xb0,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0xb0,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mov_b32 v7, src_scc ; encoding: [0xc1,0x80,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07] +0xc1,0x80,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_dx9_zero_f32 v7, src_scc, v5 ; encoding: [0xc1,0x70,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x70,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_mul_f32 v7, src_scc, v5 ; encoding: [0xc1,0x30,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x30,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_f32 v7, src_scc, v5 ; encoding: [0xc1,0x50,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x50,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_sub_nc_u32 v7, src_scc, v5 ; encoding: 
[0xc1,0x40,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x40,0x19,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, -1, v4 :: v_dual_subrev_f32 v7, src_scc, v5 ; encoding: [0xc1,0x60,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07] +0xc1,0x60,0x18,0xcf,0xfd,0x00,0x04,0x00,0xff,0x05,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_f32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_add_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_ashrrev_i32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_cndmask_b32 v7, 0.5, v2, vcc_lo ; encoding: [0xf0,0x90,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07] +0xf0,0x90,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_fmac_f32 v7, 0.5, v2 ; encoding: [0xf0,0x00,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x00,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshlrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x10,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x10,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_lshrrev_b32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_i32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] 
+0xf0,0x70,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_max_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xa0,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xa0,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_i32 v7, 0.5, v2 ; encoding: [0xf0,0x80,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x80,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_min_num_f32 v7, 0.5, v2 ; encoding: [0xf0,0xb0,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0xb0,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mov_b32 v7, 0.5 ; encoding: [0xf0,0x80,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07] +0xf0,0x80,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_dx9_zero_f32 v7, 0.5, v2 ; encoding: [0xf0,0x70,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x70,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_mul_f32 v7, 0.5, v2 ; encoding: [0xf0,0x30,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x30,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_f32 v7, 0.5, v2 ; encoding: [0xf0,0x50,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x50,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_sub_nc_u32 v7, 0.5, v2 ; encoding: [0xf0,0x40,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x40,0x19,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, 0.5, v3 :: v_dual_subrev_f32 v7, 0.5, v2 ; encoding: [0xf0,0x60,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07] +0xf0,0x60,0x18,0xcf,0xf0,0x00,0x03,0x00,0xff,0x02,0x00,0x07 + +# GFX1250: 
v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_add_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_ashrrev_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_cndmask_b32 v7, exec_hi, v3, vcc_lo ; encoding: [0x7f,0x90,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7f,0x90,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_fmac_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x00,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x00,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshlrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x10,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x10,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_lshrrev_b32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_max_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xa0,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xa0,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_i32 v7, vcc_hi, v3 ; encoding: [0x7f,0x80,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x80,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_min_num_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0xb0,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0xb0,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x70,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x70,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_mul_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x30,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x30,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x50,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x50,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_sub_nc_u32 v7, vcc_hi, v3 ; encoding: [0x7f,0x40,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x40,0x19,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v2 :: v_dual_subrev_f32 v7, vcc_hi, v3 ; encoding: [0x7f,0x60,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7f,0x60,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_hi, v255 :: v_dual_mov_b32 v7, vcc_hi ; encoding: [0x7f,0x80,0x18,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7f,0x80,0x18,0xcf,0x6b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_add_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 
v255, exec_lo, v2 :: v_dual_add_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_ashrrev_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_cndmask_b32 v7, exec_lo, v3, vcc_lo ; encoding: [0x7e,0x90,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7e,0x90,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_fmac_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x00,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x00,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshlrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x10,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x10,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_lshrrev_b32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_max_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xa0,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xa0,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_min_i32 v7, ttmp15, v3 ; encoding: [0x7e,0x80,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x80,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 
v255, exec_lo, v2 :: v_dual_min_num_f32 v7, ttmp15, v3 ; encoding: [0x7e,0xb0,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0xb0,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x70,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x70,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_mul_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x30,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x30,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x50,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x50,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_sub_nc_u32 v7, ttmp15, v3 ; encoding: [0x7e,0x40,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x40,0x19,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v2 :: v_dual_subrev_f32 v7, ttmp15, v3 ; encoding: [0x7e,0x60,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7e,0x60,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, exec_lo, v255 :: v_dual_mov_b32 v7, ttmp15 ; encoding: [0x7e,0x80,0x18,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7e,0x80,0x18,0xcf,0x7b,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_f32 v7, m0, v3 ; encoding: [0x7d,0x40,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_add_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x00,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_ashrrev_i32 
v7, m0, v3 ; encoding: [0x7d,0x60,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_cndmask_b32 v7, m0, v3, vcc_lo ; encoding: [0x7d,0x90,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7d,0x90,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_fmac_f32 v7, m0, v3 ; encoding: [0x7d,0x00,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x00,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshlrev_b32 v7, m0, v3 ; encoding: [0x7d,0x10,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x10,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_lshrrev_b32 v7, m0, v3 ; encoding: [0x7d,0x50,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_i32 v7, m0, v3 ; encoding: [0x7d,0x70,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_max_num_f32 v7, m0, v3 ; encoding: [0x7d,0xa0,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xa0,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_i32 v7, m0, v3 ; encoding: [0x7d,0x80,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x80,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_min_num_f32 v7, m0, v3 ; encoding: [0x7d,0xb0,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0xb0,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_dx9_zero_f32 v7, m0, v3 ; encoding: 
[0x7d,0x70,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x70,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_mul_f32 v7, m0, v3 ; encoding: [0x7d,0x30,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x30,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_f32 v7, m0, v3 ; encoding: [0x7d,0x50,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x50,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_sub_nc_u32 v7, m0, v3 ; encoding: [0x7d,0x40,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x40,0x19,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v2 :: v_dual_subrev_f32 v7, m0, v3 ; encoding: [0x7d,0x60,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7d,0x60,0x18,0xcf,0x7d,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, m0, v255 :: v_dual_mov_b32 v7, m0 ; encoding: [0x7d,0x80,0x18,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x7d,0x80,0x18,0xcf,0x7d,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_f32 v7, s105, v3 ; encoding: [0x01,0x40,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_add_nc_u32 v7, s105, v3 ; encoding: [0x01,0x00,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_ashrrev_i32 v7, s105, v3 ; encoding: [0x01,0x60,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_cndmask_b32 v7, s1, v3, vcc_lo ; encoding: [0x01,0x90,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] 
+0x01,0x90,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_fmac_f32 v7, s105, v3 ; encoding: [0x01,0x00,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x00,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshlrev_b32 v7, s105, v3 ; encoding: [0x01,0x10,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x10,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_lshrrev_b32 v7, s105, v3 ; encoding: [0x01,0x50,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_i32 v7, s105, v3 ; encoding: [0x01,0x70,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_max_num_f32 v7, s105, v3 ; encoding: [0x01,0xa0,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa0,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_i32 v7, s105, v3 ; encoding: [0x01,0x80,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x80,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_min_num_f32 v7, s105, v3 ; encoding: [0x01,0xb0,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb0,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_dx9_zero_f32 v7, s105, v3 ; encoding: [0x01,0x70,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x70,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_mul_f32 v7, s105, v3 ; encoding: [0x01,0x30,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x30,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: 
v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_f32 v7, s105, v3 ; encoding: [0x01,0x50,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x50,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_sub_nc_u32 v7, s105, v3 ; encoding: [0x01,0x40,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x40,0x19,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v2 :: v_dual_subrev_f32 v7, s105, v3 ; encoding: [0x01,0x60,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x60,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s1, v255 :: v_dual_mov_b32 v7, s105 ; encoding: [0x01,0x80,0x18,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x80,0x18,0xcf,0x69,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_f32 v7, s1, v3 ; encoding: [0x69,0x40,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x40,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_add_nc_u32 v7, s1, v3 ; encoding: [0x69,0x00,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_ashrrev_i32 v7, s1, v3 ; encoding: [0x69,0x60,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_cndmask_b32 v7, s105, v3, vcc_lo ; encoding: [0x69,0x90,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x69,0x90,0x18,0xcf,0x69,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_fmac_f32 v7, s1, v3 ; encoding: [0x69,0x00,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x00,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshlrev_b32 v7, s1, v3 ; 
encoding: [0x69,0x10,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x10,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_lshrrev_b32 v7, s1, v3 ; encoding: [0x69,0x50,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_i32 v7, s1, v3 ; encoding: [0x69,0x70,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_max_num_f32 v7, s1, v3 ; encoding: [0x69,0xa0,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xa0,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_i32 v7, s1, v3 ; encoding: [0x69,0x80,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x80,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_min_num_f32 v7, s1, v3 ; encoding: [0x69,0xb0,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0xb0,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_dx9_zero_f32 v7, s1, v3 ; encoding: [0x69,0x70,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x70,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_mul_f32 v7, s1, v3 ; encoding: [0x69,0x30,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x30,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_f32 v7, s1, v3 ; encoding: [0x69,0x50,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x50,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_sub_nc_u32 v7, s1, v3 ; encoding: [0x69,0x40,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x69,0x40,0x19,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v2 :: v_dual_subrev_f32 v7, s1, v3 ; encoding: [0x69,0x60,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x69,0x60,0x18,0xcf,0x01,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, s105, v255 :: v_dual_mov_b32 v7, s1 ; encoding: [0x69,0x80,0x18,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x69,0x80,0x18,0xcf,0x01,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_f32 v7, -1, v3 ; encoding: [0xfd,0x40,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_add_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x00,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_ashrrev_i32 v7, -1, v3 ; encoding: [0xfd,0x60,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x60,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_cndmask_b32 v7, -1, v3, vcc_lo ; encoding: [0xfd,0x90,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0xfd,0x90,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_fmac_f32 v7, -1, v3 ; encoding: [0xfd,0x00,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x00,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshlrev_b32 v7, -1, v3 ; encoding: [0xfd,0x10,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x10,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_lshrrev_b32 v7, -1, v3 ; encoding: [0xfd,0x50,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x50,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_i32 v7, -1, v3 ; encoding: [0xfd,0x70,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_max_num_f32 v7, -1, v3 ; encoding: [0xfd,0xa0,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xa0,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_i32 v7, -1, v3 ; encoding: [0xfd,0x80,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x80,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_min_num_f32 v7, -1, v3 ; encoding: [0xfd,0xb0,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0xb0,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_dx9_zero_f32 v7, -1, v3 ; encoding: [0xfd,0x70,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x70,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_mul_f32 v7, -1, v3 ; encoding: [0xfd,0x30,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x30,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_f32 v7, -1, v3 ; encoding: [0xfd,0x50,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x50,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_sub_nc_u32 v7, -1, v3 ; encoding: [0xfd,0x40,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0xfd,0x40,0x19,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v2 :: v_dual_subrev_f32 v7, -1, v3 ; encoding: [0xfd,0x60,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0xfd,0x60,0x18,0xcf,0xc1,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, src_scc, v255 :: v_dual_mov_b32 v7, -1 ; encoding: [0xfd,0x80,0x18,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0xfd,0x80,0x18,0xcf,0xc1,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_add_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_ashrrev_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_cndmask_b32 v7, ttmp15, v3, vcc_lo ; encoding: [0x7b,0x90,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x7b,0x90,0x18,0xcf,0x7b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_fmac_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x00,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x00,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshlrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x10,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x10,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_lshrrev_b32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x7b,0x70,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_max_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xa0,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xa0,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_i32 v7, vcc_lo, v3 ; encoding: [0x7b,0x80,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x80,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_min_num_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0xb0,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0xb0,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_dx9_zero_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x70,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x70,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_mul_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x30,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x30,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x50,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x50,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_sub_nc_u32 v7, vcc_lo, v3 ; encoding: [0x7b,0x40,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x40,0x19,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v2 :: v_dual_subrev_f32 v7, vcc_lo, v3 ; encoding: [0x7b,0x60,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x7b,0x60,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, ttmp15, v255 :: v_dual_mov_b32 v7, vcc_lo ; encoding: [0x7b,0x80,0x18,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07] 
+0x7b,0x80,0x18,0xcf,0x6a,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_f32 v7, v255, v3 ; encoding: [0x01,0x41,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_add_nc_u32 v7, v255, v3 ; encoding: [0x01,0x01,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_ashrrev_i32 v7, v255, v3 ; encoding: [0x01,0x61,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_cndmask_b32 v7, v255, v3, vcc_lo ; encoding: [0x01,0x91,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x01,0x91,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_fmac_f32 v7, v255, v3 ; encoding: [0x01,0x01,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x01,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshlrev_b32 v7, v255, v3 ; encoding: [0x01,0x11,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x11,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_lshrrev_b32 v7, v255, v3 ; encoding: [0x01,0x51,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_i32 v7, v255, v3 ; encoding: [0x01,0x71,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_max_num_f32 v7, v255, v3 ; encoding: [0x01,0xa1,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xa1,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# 
GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_i32 v7, v255, v3 ; encoding: [0x01,0x81,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x81,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_min_num_f32 v7, v255, v3 ; encoding: [0x01,0xb1,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0xb1,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_dx9_zero_f32 v7, v255, v3 ; encoding: [0x01,0x71,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x71,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_mul_f32 v7, v255, v3 ; encoding: [0x01,0x31,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x31,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_f32 v7, v255, v3 ; encoding: [0x01,0x51,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x51,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_sub_nc_u32 v7, v255, v3 ; encoding: [0x01,0x41,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x41,0x19,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v2 :: v_dual_subrev_f32 v7, v255, v3 ; encoding: [0x01,0x61,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x01,0x61,0x18,0xcf,0xff,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v1, v255 :: v_dual_mov_b32 v7, v255 ; encoding: [0x01,0x81,0x18,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x01,0x81,0x18,0xcf,0xff,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_f32 v7, v3, v3 ; encoding: [0x02,0x41,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_add_nc_u32 v7, v3, v3 ; encoding: 
[0x02,0x01,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_ashrrev_i32 v7, v3, v3 ; encoding: [0x02,0x61,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_cndmask_b32 v7, v3, v3, vcc_lo ; encoding: [0x02,0x91,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x02,0x91,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_fmac_f32 v7, v3, v3 ; encoding: [0x02,0x01,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x01,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshlrev_b32 v7, v3, v3 ; encoding: [0x02,0x11,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x11,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_lshrrev_b32 v7, v3, v3 ; encoding: [0x02,0x51,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_i32 v7, v3, v3 ; encoding: [0x02,0x71,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_max_num_f32 v7, v3, v3 ; encoding: [0x02,0xa1,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0xa1,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_i32 v7, v3, v3 ; encoding: [0x02,0x81,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x81,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_min_num_f32 v7, v3, v3 ; encoding: [0x02,0xb1,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x02,0xb1,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_dx9_zero_f32 v7, v3, v3 ; encoding: [0x02,0x71,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x71,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_mul_f32 v7, v3, v3 ; encoding: [0x02,0x31,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x31,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_f32 v7, v3, v3 ; encoding: [0x02,0x51,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x51,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_sub_nc_u32 v7, v3, v3 ; encoding: [0x02,0x41,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x41,0x19,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v2 :: v_dual_subrev_f32 v7, v3, v3 ; encoding: [0x02,0x61,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x02,0x61,0x18,0xcf,0x03,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v2, v255 :: v_dual_mov_b32 v7, v3 ; encoding: [0x02,0x81,0x18,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x02,0x81,0x18,0xcf,0x03,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_f32 v7, v2, v3 ; encoding: [0xff,0x41,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_add_nc_u32 v7, v2, v3 ; encoding: [0xff,0x01,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_ashrrev_i32 v7, v2, v3 ; encoding: [0xff,0x61,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 
v255, v255, v2 :: v_dual_cndmask_b32 v7, v2, v3, vcc_lo ; encoding: [0xff,0x91,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0xff,0x91,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_fmac_f32 v7, v2, v3 ; encoding: [0xff,0x01,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x01,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshlrev_b32 v7, v2, v3 ; encoding: [0xff,0x11,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x11,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_lshrrev_b32 v7, v2, v3 ; encoding: [0xff,0x51,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_i32 v7, v2, v3 ; encoding: [0xff,0x71,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_max_num_f32 v7, v2, v3 ; encoding: [0xff,0xa1,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xa1,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_i32 v7, v2, v3 ; encoding: [0xff,0x81,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x81,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_min_num_f32 v7, v2, v3 ; encoding: [0xff,0xb1,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0xb1,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_dx9_zero_f32 v7, v2, v3 ; encoding: [0xff,0x71,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x71,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_mul_f32 v7, v2, v3 ; encoding: 
[0xff,0x31,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x31,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_f32 v7, v2, v3 ; encoding: [0xff,0x51,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x51,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_sub_nc_u32 v7, v2, v3 ; encoding: [0xff,0x41,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x41,0x19,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v2 :: v_dual_subrev_f32 v7, v2, v3 ; encoding: [0xff,0x61,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0xff,0x61,0x18,0xcf,0x02,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v255, v255 :: v_dual_mov_b32 v7, v2 ; encoding: [0xff,0x81,0x18,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0xff,0x81,0x18,0xcf,0x02,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_f32 v7, v4, v3 ; encoding: [0x03,0x41,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_add_nc_u32 v7, v4, v3 ; encoding: [0x03,0x01,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x01,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_ashrrev_i32 v7, v4, v3 ; encoding: [0x03,0x61,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_cndmask_b32 v7, v4, v3, vcc_lo ; encoding: [0x03,0x91,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x03,0x91,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_fmac_f32 v7, v4, v3 ; encoding: [0x03,0x01,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x03,0x01,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshlrev_b32 v7, v4, v3 ; encoding: [0x03,0x11,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x11,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_lshrrev_b32 v7, v4, v3 ; encoding: [0x03,0x51,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_i32 v7, v4, v3 ; encoding: [0x03,0x71,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_max_num_f32 v7, v4, v3 ; encoding: [0x03,0xa1,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xa1,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_i32 v7, v4, v3 ; encoding: [0x03,0x81,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x81,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_min_num_f32 v7, v4, v3 ; encoding: [0x03,0xb1,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0xb1,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_dx9_zero_f32 v7, v4, v3 ; encoding: [0x03,0x71,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x71,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_mul_f32 v7, v4, v3 ; encoding: [0x03,0x31,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x31,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_sub_f32 v7, v4, v3 ; encoding: [0x03,0x51,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x51,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 
v255, v3, v2 :: v_dual_sub_nc_u32 v7, v4, v3 ; encoding: [0x03,0x41,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x41,0x19,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v2 :: v_dual_subrev_f32 v7, v4, v3 ; encoding: [0x03,0x61,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x03,0x61,0x18,0xcf,0x04,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v3, v255 :: v_dual_mov_b32 v7, v4 ; encoding: [0x03,0x81,0x18,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x03,0x81,0x18,0xcf,0x04,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_f32 v7, v1, v3 ; encoding: [0x04,0x41,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_add_nc_u32 v7, v1, v3 ; encoding: [0x04,0x01,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_ashrrev_i32 v7, v1, v3 ; encoding: [0x04,0x61,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_bitop2_b32 v7, v1, v3 bitop3:0x83 ; encoding: [0x04,0x21,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x83,0x07] +0x04,0x21,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x83,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, s96 ; encoding: [0x04,0x91,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07] +0x04,0x91,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x60,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_cndmask_b32 v7, v1, v3, vcc_lo ; encoding: [0x04,0x91,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07] +0x04,0x91,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fma_f32 v7, v1, v3, v4 ; encoding: 
[0x04,0x31,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07] +0x04,0x31,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x04,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_fmac_f32 v7, v1, v3 ; encoding: [0x04,0x01,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x01,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshlrev_b32 v7, v1, v3 ; encoding: [0x04,0x11,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x11,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_lshrrev_b32 v7, v1, v3 ; encoding: [0x04,0x51,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_i32 v7, v1, v3 ; encoding: [0x04,0x71,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_max_num_f32 v7, v1, v3 ; encoding: [0x04,0xa1,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xa1,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_i32 v7, v1, v3 ; encoding: [0x04,0x81,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x81,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_min_num_f32 v7, v1, v3 ; encoding: [0x04,0xb1,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0xb1,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_dx9_zero_f32 v7, v1, v3 ; encoding: [0x04,0x71,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x71,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_mul_f32 v7, v1, v3 ; encoding: [0x04,0x31,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x04,0x31,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_f32 v7, v1, v3 ; encoding: [0x04,0x51,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x51,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_sub_nc_u32 v7, v1, v3 ; encoding: [0x04,0x41,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x41,0x19,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v2 :: v_dual_subrev_f32 v7, v1, v3 ; encoding: [0x04,0x61,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07] +0x04,0x61,0x18,0xcf,0x01,0x01,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, v4, v255 :: v_dual_mov_b32 v7, v1 ; encoding: [0x04,0x81,0x18,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07] +0x04,0x81,0x18,0xcf,0x01,0x01,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_add_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x00,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_ashrrev_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_cndmask_b32 v7, vcc_hi, v3, vcc_lo ; encoding: [0x6b,0x90,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6b,0x90,0x18,0xcf,0x6b,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_fmac_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x00,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x00,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshlrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x10,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x10,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_lshrrev_b32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x50,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_max_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xa0,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xa0,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_i32 v7, exec_lo, v3 ; encoding: [0x6b,0x80,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x80,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_min_num_f32 v7, exec_lo, v3 ; encoding: [0x6b,0xb0,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0xb0,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x70,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x70,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_mul_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x30,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x30,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x50,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6b,0x50,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_sub_nc_u32 v7, exec_lo, v3 ; encoding: [0x6b,0x40,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x40,0x19,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v2 :: v_dual_subrev_f32 v7, exec_lo, v3 ; encoding: [0x6b,0x60,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6b,0x60,0x18,0xcf,0x7e,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_hi, v255 :: v_dual_mov_b32 v7, exec_lo ; encoding: [0x6b,0x80,0x18,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6b,0x80,0x18,0xcf,0x7e,0x00,0xff,0x00,0xff,0x00,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x40,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_add_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_ashrrev_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_cndmask_b32 v7, vcc_lo, v3, vcc_lo ; encoding: [0x6a,0x90,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07] +0x6a,0x90,0x18,0xcf,0x6a,0x00,0x02,0x00,0xff,0x03,0x6a,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_fmac_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x00,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x00,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshlrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x10,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0x10,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_lshrrev_b32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_max_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xa0,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xa0,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_i32 v7, exec_hi, v3 ; encoding: [0x6a,0x80,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x80,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_min_num_f32 v7, exec_hi, v3 ; encoding: [0x6a,0xb0,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0xb0,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_dx9_zero_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x70,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x70,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_mul_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x30,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x30,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x50,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x50,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_sub_nc_u32 v7, exec_hi, v3 ; encoding: [0x6a,0x40,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] 
+0x6a,0x40,0x19,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v2 :: v_dual_subrev_f32 v7, exec_hi, v3 ; encoding: [0x6a,0x60,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07] +0x6a,0x60,0x18,0xcf,0x7f,0x00,0x02,0x00,0xff,0x03,0x00,0x07 + +# GFX1250: v_dual_subrev_f32 v255, vcc_lo, v255 :: v_dual_mov_b32 v7, exec_hi ; encoding: [0x6a,0x80,0x18,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07] +0x6a,0x80,0x18,0xcf,0x7f,0x00,0xff,0x00,0xff,0x00,0x00,0x07 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt index 2b8d58853847b..55fdc2b15bf05 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt @@ -87,6 +87,7 @@ # CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x05 -# CHECK-MI: v_pk_fmac_f16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +# FIXME: The instruction gets printed using the wrong function (AMDGPUInstPrinter::printOperandAndIntInputMods) and hence the "-" modifier is not printed. 
+# COM: v_pk_fmac_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 0xf9,0x04,0x0a,0x78,0x01,0x06,0x06,0x16 diff --git a/llvm/test/MC/ELF/mc-dump.s b/llvm/test/MC/ELF/mc-dump.s index 389941db23e3b..36d3a05768dc6 100644 --- a/llvm/test/MC/ELF/mc-dump.s +++ b/llvm/test/MC/ELF/mc-dump.s @@ -12,7 +12,7 @@ # CHECK-NEXT:0 Data Size:0 [] # CHECK-NEXT: Symbol @0 _start # CHECK-NEXT:0 Org Offset:3 Value:0 -# CHECK-NEXT:3 Relaxable Size:2 > +# CHECK-NEXT:3 Relaxable Size:2 > # CHECK-NEXT: Fixup @1 Value:.Ltmp0 Kind:4001 # CHECK-NEXT:5 Data Size:16 [48,8b,04,25,00,00,00,00,48,8b,04,25,00,00,00,00] # CHECK-NEXT: Fixup @4 Value:f0@ Kind:4017 diff --git a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td index e9c2069fdbd98..c3895b524e85e 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td @@ -535,7 +535,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3), // R00O-NEXT: GIM_Reject, // R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]] // R00O-NEXT: GIM_Reject, -// R00O-NEXT: }; // Size: 1890 bytes +// R00O-NEXT: }; // Size: 1894 bytes def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4), [(set GPR32:$dst, diff --git a/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll b/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll index f8dcd80d4e141..3394efd52a3ba 100644 --- a/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll +++ b/llvm/test/ThinLTO/X86/memprof-icp-recursive.ll @@ -54,7 +54,40 @@ ; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \ ; RUN: --check-prefix=REMARKS -; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefixes=IR,IR-INLINE + +;; Next, add a threshold to prevent inlining of the promoted calls which have +;; count 2 (the default threshold of 2 means they 
are inlinable by default). +; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \ +; RUN: -memprof-icp-noinline-threshold=3 \ +; RUN: -enable-memprof-indirect-call-support=true \ +; RUN: -memprof-allow-recursive-callsites \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ +; RUN: -r=%t/foo.o,_ZN2B03barEj, \ +; RUN: -r=%t/foo.o,_ZN1B3barEj, \ +; RUN: -r=%t/main.o,_Z3fooR2B0j, \ +; RUN: -r=%t/main.o,_Znwm, \ +; RUN: -r=%t/main.o,_ZdlPvm, \ +; RUN: -r=%t/main.o,_Z8externalPi, \ +; RUN: -r=%t/main.o,main,plx \ +; RUN: -r=%t/main.o,_ZN2B03barEj,plx \ +; RUN: -r=%t/main.o,_ZN1B3barEj,plx \ +; RUN: -r=%t/main.o,_ZTV1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS2B0,plx \ +; RUN: -r=%t/main.o,_ZTI2B0,plx \ +; RUN: -r=%t/main.o,_ZTI1B,plx \ +; RUN: -r=%t/main.o,_ZTV2B0,plx \ +; RUN: -thinlto-threads=1 \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=. 
-save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \ +; RUN: --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefixes=IR,IR-NOINLINE ; REMARKS: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 ; REMARKS: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 @@ -98,12 +131,14 @@ ; IR: %[[R1:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj ; IR: br i1 %[[R1]], label %if.true.direct_targ, label %if.false.orig_indirect ; IR: if.true.direct_targ: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]] +; IR-INLINE: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]] +; IR-NOINLINE: call {{.*}} @_ZN1B3barEj(ptr null, i32 0) #[[NOINLINE:[0-9]+]] ; IR: if.false.orig_indirect: ; IR: %[[R2:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj ; IR: br i1 %[[R2]], label %if.true.direct_targ1, label %if.false.orig_indirect2 ; IR: if.true.direct_targ1: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]] +; IR-INLINE: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]] +; IR-NOINLINE: call {{.*}} @_ZN2B03barEj(ptr null, i32 0) #[[NOINLINE]] ; IR: if.false.orig_indirect2: ; IR: call {{.*}} %0 @@ -114,17 +149,20 @@ ; IR: %[[R3:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj ; IR: br i1 %[[R3]], label %if.true.direct_targ, label %if.false.orig_indirect ; IR: if.true.direct_targ: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]] +; IR-INLINE: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]] +; IR-NOINLINE: call {{.*}} @_ZN1B3barEj.memprof.1(ptr null, i32 0) #[[NOINLINE]] ; IR: if.false.orig_indirect: ; IR: %[[R4:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj ; IR: br i1 %[[R4]], label %if.true.direct_targ1, label %if.false.orig_indirect2 ; IR: if.true.direct_targ1: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]] +; IR-INLINE: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]] +; IR-NOINLINE: call {{.*}} @_ZN2B03barEj.memprof.1(ptr null, i32 0) #[[NOINLINE]] ; IR: if.false.orig_indirect2: ; IR: call {{.*}} 
%0 -; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold" -; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold" +; IR-INLINE: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold" +; IR-INLINE: attributes #[[COLD]] = {{.*}} "memprof"="cold" +; IR-NOINLINE: attributes #[[NOINLINE]] = { noinline } ;--- foo.ll target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/ThinLTO/X86/memprof_callee_type_mismatch.ll b/llvm/test/ThinLTO/X86/memprof_callee_type_mismatch.ll new file mode 100644 index 0000000000000..a2cca00515732 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof_callee_type_mismatch.ll @@ -0,0 +1,62 @@ +;; Test to ensure the callite when updated to call a clone does not mutate the +;; callee function type. In rare cases we may end up with a callee declaration +;; that does not match the call type, because it was imported from a different +;; module with an incomplete return type (in which case clang gives it a void +;; return type). + +; RUN: rm -rf %t && split-file %s %t && cd %t +; RUN: llvm-as src.ll -o src.o +; RUN: llvm-as src.o.thinlto.ll -o src.o.thinlto.bc +; RUN: opt -passes=memprof-context-disambiguation src.o -S -memprof-import-summary=src.o.thinlto.bc | FileCheck %s + +;--- src.ll +; ModuleID = 'src.o' +source_filename = "src.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @main(ptr %b) { +entry: + ;; This call is not changed as the summary specifies clone 0. + ; CHECK: call ptr @_Z3foov() + %call = call ptr @_Z3foov(), !callsite !5 + ;; After changing this call to call a clone, the function type should still + ;; be ptr, despite the void on the callee declaration. 
+ ; CHECK: call ptr @_Z3foov.memprof.1() + %call1 = call ptr @_Z3foov(), !callsite !6 + %0 = load ptr, ptr %b, align 8 + ;; Although the summary indicates this should call clone 1, and the VP + ;; metadata indicates the callee is _Z3foov, it is not updated because + ;; the ICP facility requires the function types to match. + ; CHECK: call ptr %0() + %call2 = call ptr %0(), !prof !7, !callsite !8 + ret i32 0 +} + +;; Both the original callee function declaration and its clone have void return +;; type. +; CHECK: declare void @_Z3foov() +; CHECK: declare void @_Z3foov.memprof.1() +declare void @_Z3foov() + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 21.0.0git (git@github.com:llvm/llvm-project.git e391301e0e4d9183fe06e69602e87b0bc889aeda)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "src.cc", directory: "", checksumkind: CSK_MD5, checksum: "8636c46e81402013b9d54e8307d2f149") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"EnableSplitLTOUnit", i32 0} +!5 = !{i64 8632435727821051414} +!6 = !{i64 -3421689549917153178} +!7 = !{!"VP", i32 0, i64 4, i64 9191153033785521275, i64 4} +!8 = !{i64 1234} + +;--- src.o.thinlto.ll +; ModuleID = 'src.o.thinlto.bc' +source_filename = "src.o.thinlto.bc" + +^0 = module: (path: "src.o", hash: (2823430083, 3994560862, 899296057, 1055405378, 2961356784)) +^1 = gv: (guid: 15822663052811949562, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 1, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 3, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), 
stackIds: (8632435727821051414)), (callee: null, clones: (1), stackIds: (15025054523792398438)), (callee: null, clones: (1), stackIds: (1234)))))) +^2 = flags: 353 +^3 = blockcount: 0 diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg-nested-struct.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg-nested-struct.ll new file mode 100644 index 0000000000000..12dfa16991326 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-split-dbg-nested-struct.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes='cgscc(coro-split)' -S | FileCheck %s + +; Test that nested structs in coroutine frames have correct debug info scoping. + +; Minimal nested struct types that used to trigger a scoping issue: +; we used to set the wrong `scope` for the `DIDerivedType` member entries of the `DICompositeType` +; as well as the `scope` for `DICompositeType` for the inner struct itself. +%"struct.Inner" = type { i32, ptr } +%"struct.Outer" = type { %"struct.Inner", i64 } +%"class.Promise" = type { %"struct.Outer" } + +define void @test_coro_function() presplitcoroutine !dbg !10 { +entry: + %__promise = alloca %"class.Promise", align 8 + %0 = call token @llvm.coro.id(i32 0, ptr %__promise, ptr null, ptr null) + %1 = call ptr @llvm.coro.begin(token %0, ptr null) + %2 = call token @llvm.coro.save(ptr null) + ret void +} + +; CHECK: define void @test_coro_function() + +; Check that frame debug info is generated +; CHECK: ![[FRAME_TYPE:[0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "{{.*}}.coro_frame_ty" + +; Key validation: Check that nested structs have the correct scope hierarchy +; 1. Promise should be scoped to the frame +; CHECK: ![[PROMISE:[0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "class_Promise", scope: ![[FRAME_TYPE]] + +; 2. Members of Promise should be scoped to Promise (check this before Outer since it comes first in output) +; CHECK: !DIDerivedType(tag: DW_TAG_member, name: "struct_Outer", scope: ![[PROMISE]] + +; 3. 
Outer should be scoped to Promise (not the frame!) +; CHECK: ![[OUTER:[0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "struct_Outer", scope: ![[PROMISE]] + +; 4. First Outer member should be scoped to Outer +; CHECK: !DIDerivedType(tag: DW_TAG_member, name: "struct_Inner", scope: ![[OUTER]] + +; 5. Inner should be scoped to Outer (proper nesting) +; CHECK: ![[INNER:[0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "struct_Inner", scope: ![[OUTER]] + +; 6. Members of Inner should be scoped to Inner +; CHECK: !DIDerivedType(tag: DW_TAG_member, name: "__int_32", scope: ![[INNER]] +; CHECK: !DIDerivedType(tag: DW_TAG_member, name: "PointerType", scope: ![[INNER]] + +; 7. Second Outer member comes after Inner (due to output order) +; CHECK: !DIDerivedType(tag: DW_TAG_member, name: "__int_64", scope: ![[OUTER]] + +declare token @llvm.coro.id(i32, ptr readnone, ptr readonly, ptr) +declare ptr @llvm.coro.begin(token, ptr writeonly) +declare token @llvm.coro.save(ptr) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.cpp", directory: ".") +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "test_coro_function", scope: !1, file: !1, line: 1, type: !11, spFlags: DISPFlagDefinition, unit: !0) +!11 = !DISubroutineType(types: !12) +!12 = !{null} diff --git a/llvm/test/Transforms/InstCombine/icmp_or_umul_overflow.ll b/llvm/test/Transforms/InstCombine/icmp_or_umul_overflow.ll new file mode 100644 index 0000000000000..13c7fce38ef01 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/icmp_or_umul_overflow.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +declare void @use.i1(i1 %x) +declare void @use.i64(i64 %x) 
+declare void @use.i64i1({i64, i1} %x) + +define i1 @umul_greater_than_or_overflow_const(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[IN]], 109802048057794950 +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 168) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, -16 + %ret = or i1 %ovf, %cmp + ret i1 %ret +} + +define i1 @umul_greater_than_or_overflow_const_i8(i8 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_i8( +; CHECK-SAME: i8 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i8 [[IN]], 10 +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %in, i8 24) + %mul = extractvalue { i8, i1 } %mwo, 0 + %ovf = extractvalue { i8, i1 } %mwo, 1 + %cmp = icmp ugt i8 %mul, -16 + %ret = or i1 %ovf, %cmp + ret i1 %ret +} + +define i1 @umul_greater_than_or_overflow_const_commuted(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_commuted( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[IN]], 192153584101141162 +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, 9223372036854775800 + %ret = or i1 %cmp, %ovf + ret i1 %ret +} + +define i1 @umul_greater_than_or_overflow_const_disjoint(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_disjoint( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[IN]], 230584300921369395 +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 40) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt 
i64 %mul, 9223372036854775800 + %ret = or disjoint i1 %ovf, %cmp + ret i1 %ret +} + +define i1 @umul_greater_than_or_overflow_const_multiuse_mul(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_multiuse_mul( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[IN]], 48 +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i64 [[IN]], 192153584101141162 +; CHECK-NEXT: tail call void @use.i64(i64 [[MUL]]) +; CHECK-NEXT: ret i1 [[RET]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, 9223372036854775800 + %ret = or i1 %ovf, %cmp + tail call void @use.i64(i64 %mul) + ret i1 %ret +} + +define i1 @umul_greater_than_or_overflow_const_multiuse_overflow(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_multiuse_overflow( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[OVF:%.*]] = icmp ugt i64 [[IN]], 384307168202282325 +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i64 [[IN]], 192153584101141162 +; CHECK-NEXT: tail call void @use.i1(i1 [[OVF]]) +; CHECK-NEXT: ret i1 [[RET]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, 9223372036854775800 + %ret = or i1 %ovf, %cmp + tail call void @use.i1(i1 %ovf) + ret i1 %ret +} + +define i1 @umul_greater_than_or_overflow_const_multiuse_umul_call(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_multiuse_umul_call( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[MWO:%.*]] = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[IN]], i64 48) +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i64 [[IN]], 192153584101141162 +; CHECK-NEXT: tail call void @use.i64i1({ i64, i1 } [[MWO]]) +; CHECK-NEXT: ret i1 [[RET]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + 
%mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, 9223372036854775800 + %ret = or i1 %ovf, %cmp + tail call void @use.i64i1({ i64, i1 } %mwo) + ret i1 %ret +} + +define <2 x i1> @umul_greater_than_or_overflow_const_vector_splat(<2 x i64> %in) { +; CHECK-LABEL: define <2 x i1> @umul_greater_than_or_overflow_const_vector_splat( +; CHECK-SAME: <2 x i64> [[IN:%.*]]) { +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt <2 x i64> [[IN]], splat (i64 6477087104532848) +; CHECK-NEXT: ret <2 x i1> [[TMP6]] +; + %mwo = tail call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> %in, <2 x i64> ) + %mul = extractvalue { <2 x i64>, <2 x i1> } %mwo, 0 + %ovf = extractvalue { <2 x i64>, <2 x i1> } %mwo, 1 + %cmp = icmp ugt <2 x i64> %mul, + %ret = or <2 x i1> %ovf, %cmp + ret <2 x i1> %ret +} + +; Negative test +define <4 x i1> @umul_greater_than_or_overflow_const_vector_non_splat_negative(<4 x i64> %in) { +; CHECK-LABEL: define <4 x i1> @umul_greater_than_or_overflow_const_vector_non_splat_negative( +; CHECK-SAME: <4 x i64> [[IN:%.*]]) { +; CHECK-NEXT: [[MWO:%.*]] = tail call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[IN]], <4 x i64> ) +; CHECK-NEXT: [[MUL:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[MWO]], 0 +; CHECK-NEXT: [[OVF:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[MWO]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <4 x i64> [[MUL]], +; CHECK-NEXT: [[RET:%.*]] = or <4 x i1> [[OVF]], [[CMP]] +; CHECK-NEXT: ret <4 x i1> [[RET]] +; + %mwo = tail call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v2i64(<4 x i64> %in, <4 x i64> ) + %mul = extractvalue { <4 x i64>, <4 x i1> } %mwo, 0 + %ovf = extractvalue { <4 x i64>, <4 x i1> } %mwo, 1 + %cmp = icmp ugt <4 x i64> %mul, + %ret = or <4 x i1> %ovf, %cmp + ret <4 x i1> %ret +} + +; Negative test +define <2 x i1> @umul_greater_than_or_overflow_const_vector_poison_non_splat_negative(<2 x i64> %in) { +; CHECK-LABEL: define <2 x i1> 
@umul_greater_than_or_overflow_const_vector_poison_non_splat_negative( +; CHECK-SAME: <2 x i64> [[IN:%.*]]) { +; CHECK-NEXT: [[MWO:%.*]] = tail call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[IN]], <2 x i64> ) +; CHECK-NEXT: [[MUL:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[MWO]], 0 +; CHECK-NEXT: [[OVF:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[MWO]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i64> [[MUL]], +; CHECK-NEXT: [[RET:%.*]] = or <2 x i1> [[OVF]], [[CMP]] +; CHECK-NEXT: ret <2 x i1> [[RET]] +; + %mwo = tail call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> %in, <2 x i64> ) + %mul = extractvalue { <2 x i64>, <2 x i1> } %mwo, 0 + %ovf = extractvalue { <2 x i64>, <2 x i1> } %mwo, 1 + %cmp = icmp ugt <2 x i64> %mul, + %ret = or <2 x i1> %ovf, %cmp + ret <2 x i1> %ret +} + +; Negative test +define i1 @umul_greater_than_and_overflow_const_negative(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_and_overflow_const_negative( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[IN]], i64 48) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP3]], 9223372036854775800 +; CHECK-NEXT: [[TMP6:%.*]] = and i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ult i64 %mul, 9223372036854775800 + %ret = and i1 %ovf, %cmp + ret i1 %ret +} + +; Negative test +define i1 @umul_less_than_or_overflow_const_negative(i64 %in) { +; CHECK-LABEL: define i1 @umul_less_than_or_overflow_const_negative( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[IN]], i64 48) +; CHECK-NEXT: 
[[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP3]], 9223372036854775800 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ult i64 %mul, 9223372036854775800 + %ret = or i1 %ovf, %cmp + ret i1 %ret +} + +; Negative test +define i1 @umul_greater_than_or_overflow_const_multiuse_icmp_negative(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_multiuse_icmp_negative( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[IN]], i64 48) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], 9223372036854775800 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: tail call void @use.i1(i1 [[TMP5]]) +; CHECK-NEXT: ret i1 [[TMP6]] +; + %mwo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 48) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, 9223372036854775800 + %ret = or i1 %ovf, %cmp + tail call void @use.i1(i1 %cmp) + ret i1 %ret +} + +; Negative test. The umul.with.overflow should be folded away before. 
+define i1 @umul_greater_than_or_overflow_const_0_negative(i64 %in) { +; CHECK-LABEL: define i1 @umul_greater_than_or_overflow_const_0_negative( +; CHECK-SAME: i64 [[IN:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %mwo = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %in, i64 0) + %mul = extractvalue { i64, i1 } %mwo, 0 + %ovf = extractvalue { i64, i1 } %mwo, 1 + %cmp = icmp ugt i64 %mul, 0 + %ret = or i1 %ovf, %cmp + ret i1 %ret +} diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll index 3c1094f2ee31d..ff2527d5bb6ad 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll @@ -131,12 +131,11 @@ define i32 @negative_test_type_is_struct(i32 %c, ptr %a, ptr %b) { ; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: .LBB2_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: ldr w9, [x1], #4 ; CHECK-NEXT: cbnz w9, .LBB2_5 ; CHECK-NEXT: // %bb.3: // %for.cond ; CHECK-NEXT: // in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: subs x8, x8, #1 -; CHECK-NEXT: add x1, x1, #4 ; CHECK-NEXT: b.ne .LBB2_2 ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: mov w0, wzr diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll index 9fee8a390504a..61ef3cef603fa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll @@ -14,16 +14,15 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 510, [[TMP1]] -; CHECK-NEXT: 
br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 510, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 510, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 ; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 3, [[N_VEC]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: @@ -31,43 +30,13 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 32 -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 48 -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP40]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP29]], align 1 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP38]], align 1 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP41]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr 
[[TMP8]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 32 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 48 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP27]] -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP22]], align 1 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP25]], align 1 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP28]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ne [[WIDE_LOAD]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp ne [[WIDE_LOAD5]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-NEXT: [[TMP32:%.*]] = icmp ne [[WIDE_LOAD4]], [[WIDE_LOAD8]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-NEXT: [[TMP33:%.*]] = or [[TMP11]], [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = or [[TMP33]], [[TMP31]] -; CHECK-NEXT: [[TMP35:%.*]] = or [[TMP34]], [[TMP32]] -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP35]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP32]]) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] ; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], 
!llvm.loop [[LOOP0:![0-9]+]] @@ -77,26 +46,7 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP63]], 16 -; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 true) -; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP42]], 3 -; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[TMP62]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP31]], i1 true) -; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP42]], 2 -; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP46]], [[TMP42]] -; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]] -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP30]], i1 true) -; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP42]], 1 -; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]] -; CHECK-NEXT: [[TMP53:%.*]] = icmp ne i64 [[TMP52]], [[TMP42]] -; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP11]], i1 true) -; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP42]], 0 -; CHECK-NEXT: [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]] -; CHECK-NEXT: [[TMP59:%.*]] = icmp ne i64 [[TMP15]], [[TMP42]] -; CHECK-NEXT: [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]] +; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 true) ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]] ; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]] ; CHECK-NEXT: br label [[LOOP_END]] diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll index 5efd821ba990f..3cde3f3422cf9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 5 ; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF2IC2 %s ; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" @@ -25,14 +26,10 @@ define void @derived_int_ivs(ptr noalias %a, ptr noalias %b, i64 %end) { ; VF2-NEXT: [[TMP5:%.*]] = mul i64 [[INDEX]], 16 ; VF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 16, [[TMP5]] ; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_IDX]] -; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP6]], align 8 -; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> -; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_IDX]] -; VF2-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[STRIDED_VEC]], <2 x double> [[STRIDED_VEC1]], <4 x i32> -; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> 
[[TMP8]], <4 x double> poison, <4 x i32> -; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8 -; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: @@ -40,6 +37,41 @@ define void @derived_int_ivs(ptr noalias %a, ptr noalias %b, i64 %end) { ; VF2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] ; VF2: [[SCALAR_PH]]: ; +; VF2IC2-LABEL: define void @derived_int_ivs( +; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[END:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[END]], -32 +; VF2IC2-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 4 +; VF2IC2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VF2IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; VF2IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 16 +; VF2IC2-NEXT: [[TMP4:%.*]] = add i64 16, [[TMP3]] +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[TMP5:%.*]] = mul i64 [[INDEX]], 16 +; VF2IC2-NEXT: [[OFFSET_IDX:%.*]] = add i64 16, [[TMP5]] +; VF2IC2-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 16 +; VF2IC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_IDX]] +; VF2IC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP7]], 
align 8 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP8]], align 8 +; VF2IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_IDX]] +; VF2IC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; VF2IC2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP9]], align 8 +; VF2IC2-NEXT: store <2 x double> [[WIDE_LOAD1]], ptr [[TMP10]], align 8 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; VF2IC2: [[SCALAR_PH]]: +; ; VF4-LABEL: define void @derived_int_ivs( ; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[END:%.*]]) { ; VF4-NEXT: [[ENTRY:.*:]] @@ -135,13 +167,9 @@ define void @derived_pointer_ivs(ptr noalias %a, ptr noalias %b, ptr %end) { ; VF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; VF2-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 ; VF2-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX6]] -; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[NEXT_GEP]], align 8 -; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> -; VF2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> -; VF2-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[STRIDED_VEC]], <2 x double> [[STRIDED_VEC8]], <4 x i32> -; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> -; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP7]], align 8 -; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x 
double>, ptr [[NEXT_GEP]], align 8 +; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[NEXT_GEP7]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VF2-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: @@ -149,6 +177,61 @@ define void @derived_pointer_ivs(ptr noalias %a, ptr noalias %b, ptr %end) { ; VF2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] ; VF2: [[SCALAR_PH]]: ; +; VF2IC2-LABEL: define void @derived_pointer_ivs( +; VF2IC2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr [[END:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: [[A5:%.*]] = ptrtoint ptr [[A]] to i64 +; VF2IC2-NEXT: [[END4:%.*]] = ptrtoint ptr [[END]] to i64 +; VF2IC2-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; VF2IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[END4]], -16 +; VF2IC2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[A5]] +; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 4 +; VF2IC2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; VF2IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; VF2IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; VF2IC2: [[VECTOR_MEMCHECK]]: +; VF2IC2-NEXT: [[TMP4:%.*]] = add i64 [[END1]], -16 +; VF2IC2-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], [[A2]] +; VF2IC2-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 4 +; VF2IC2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4 +; VF2IC2-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 16 +; VF2IC2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; VF2IC2-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP8]] +; VF2IC2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP3]] +; VF2IC2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; VF2IC2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; 
VF2IC2-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; VF2IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; VF2IC2-NEXT: [[TMP9:%.*]] = mul i64 [[N_VEC]], 16 +; VF2IC2-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; VF2IC2-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 +; VF2IC2-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; VF2IC2-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 16 +; VF2IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] +; VF2IC2-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP13]] +; VF2IC2-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], 16 +; VF2IC2-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX7]], 16 +; VF2IC2-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX7]] +; VF2IC2-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[NEXT_GEP]], align 8 +; VF2IC2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x double>, ptr [[NEXT_GEP6]], align 8 +; VF2IC2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[NEXT_GEP8]], align 8 +; VF2IC2-NEXT: store <2 x double> [[WIDE_LOAD10]], ptr [[NEXT_GEP9]], align 8 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; VF2IC2-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; VF2IC2: [[SCALAR_PH]]: +; ; VF4-LABEL: 
define void @derived_pointer_ivs( ; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr [[END:%.*]]) { ; VF4-NEXT: [[ENTRY:.*:]] @@ -235,21 +318,43 @@ define void @narrow_with_uniform_add_and_gep(ptr noalias %p) { ; VF2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 0 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP0]] -; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 -; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 ; VF2-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 1) -; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> -; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> -; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP1]], align 8 +; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 ; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] ; VF2: [[SCALAR_PH]]: ; +; VF2IC2-LABEL: define void @narrow_with_uniform_add_and_gep( +; VF2IC2-SAME: ptr noalias [[P:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 2 +; VF2IC2-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 0 +; VF2IC2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP0]], 0 +; VF2IC2-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP1]] +; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP2]] +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 +; VF2IC2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[WIDE_LOAD]], splat (i64 1) +; VF2IC2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[WIDE_LOAD1]], splat (i64 1) +; VF2IC2-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP3]], align 8 +; VF2IC2-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 +; VF2IC2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; VF2IC2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; VF2IC2: [[SCALAR_PH]]: +; ; VF4-LABEL: define void @narrow_with_uniform_add_and_gep( ; VF4-SAME: ptr noalias [[P:%.*]]) { ; VF4-NEXT: [[ENTRY:.*:]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll index 6dc17e0993a22..0f99ed576f1fe 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll @@ -15,22 +15,10 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds 
i32, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4 -; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8 -; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 -; VF4IC4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10) -; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], splat (i32 10) -; VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD2]], splat (i32 10) +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10) -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP6]] -; VF4IC4-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP7]] -; VF4IC4-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP8]] -; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) ; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] ; VF4IC4-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -101,31 +89,13 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP2:%.*]] = 
getelementptr inbounds i8, ptr [[TMP0]], i32 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 -; VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4 -; VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8 -; VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1 -; VF4IC4-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]] -; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]] -; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]] -; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]] -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 
x i1> [[TMP16]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -134,20 +104,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) -; VF4IC4-NEXT: [[TMP34:%.*]] = add i64 12, [[TMP33]] -; VF4IC4-NEXT: [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) -; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 8, [[TMP35]] -; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4 -; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) -; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 4, [[TMP26]] -; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4 -; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) -; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 0, [[TMP30]] -; VF4IC4-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4 -; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]] +; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_END]] @@ -210,22 +167,10 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4-NEXT: [[INDEX:%.*]] = phi 
i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF4IC4-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 4 -; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 -; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1 -; VF4IC4-NEXT: [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72) -; VF4IC4-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], splat (i8 72) -; VF4IC4-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], splat (i8 72) +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], splat (i8 72) -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP15]] -; VF4IC4-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP16]] -; VF4IC4-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP17]] -; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) ; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] ; VF4IC4-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -234,20 +179,7 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label 
[[SCALAR_PH]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) -; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 12, [[TMP28]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP16]], i1 true) -; VF4IC4-NEXT: [[TMP19:%.*]] = add i64 8, [[TMP30]] -; VF4IC4-NEXT: [[TMP18:%.*]] = icmp ne i64 [[TMP30]], 4 -; VF4IC4-NEXT: [[TMP20:%.*]] = select i1 [[TMP18]], i64 [[TMP19]], i64 [[TMP29]] -; VF4IC4-NEXT: [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true) -; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 4, [[TMP21]] -; VF4IC4-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 4 -; VF4IC4-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i64 [[TMP23]], i64 [[TMP20]] -; VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) -; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 0, [[TMP25]] -; VF4IC4-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP25]], 4 -; VF4IC4-NEXT: [[TMP6:%.*]] = select i1 [[TMP26]], i64 [[TMP27]], i64 [[TMP24]] +; VF4IC4-NEXT: [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) ; VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]] ; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]] ; VF4IC4-NEXT: br label [[LOOP_END]] @@ -304,31 +236,13 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; VF4IC4-NEXT: 
[[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 -; VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4 -; VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8 -; VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1 -; VF4IC4-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]] -; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]] -; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]] -; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]] -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: 
br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -337,20 +251,7 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) -; VF4IC4-NEXT: [[TMP34:%.*]] = add i64 12, [[TMP33]] -; VF4IC4-NEXT: [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) -; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 8, [[TMP35]] -; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4 -; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) -; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 4, [[TMP26]] -; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4 -; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) -; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 0, [[TMP30]] -; VF4IC4-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4 -; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]] +; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_END]] @@ -414,31 +315,13 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = 
getelementptr inbounds i8, ptr [[TMP0]], i32 8 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 -; VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4 -; VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8 -; VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1 -; VF4IC4-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]] -; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]] -; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]] -; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]] -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; 
VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -447,20 +330,7 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) -; VF4IC4-NEXT: [[TMP34:%.*]] = add i64 12, [[TMP33]] -; VF4IC4-NEXT: [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) -; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 8, [[TMP35]] -; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4 -; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) -; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 4, [[TMP26]] -; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4 -; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) -; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 0, [[TMP30]] -; VF4IC4-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4 -; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]] +; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] @@ -531,31 +401,13 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr 
inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 -; VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4 -; VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8 -; VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP22]], align 1 -; VF4IC4-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]] -; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]] -; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> 
[[TMP14]], [[TMP12]] -; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]] -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -564,20 +416,7 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP33:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) -; VF4IC4-NEXT: [[TMP34:%.*]] = add i64 12, [[TMP33]] -; VF4IC4-NEXT: [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) -; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 8, [[TMP35]] -; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP35]], 4 -; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i64 [[TMP24]], i64 [[TMP34]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) -; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 4, [[TMP26]] -; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 4 -; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) -; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 0, [[TMP30]] -; VF4IC4-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 4 -; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP29]] +; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add 
i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] @@ -648,48 +487,18 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3 -; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4 -; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3 -; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8 -; VF4IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3 -; VF4IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 -; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 -; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 -; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> +; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3 ; VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 ; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> ; VF4IC4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 -; VF4IC4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3 -; VF4IC4-NEXT: [[TMP28:%.*]] = 
getelementptr inbounds i8, ptr [[TMP25]], i32 -4 -; VF4IC4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 -3 -; VF4IC4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -8 -; VF4IC4-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 -3 -; VF4IC4-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -12 -; VF4IC4-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP45]], i32 -3 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1 -; VF4IC4-NEXT: [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD7]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1 -; VF4IC4-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD10]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP44]], align 1 -; VF4IC4-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD12]], <4 x i8> poison, <4 x i32> +; VF4IC4-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3 ; VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP46]], align 1 ; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE8]] -; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE10]], [[REVERSE11]] -; VF4IC4-NEXT: [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE12]], [[REVERSE13]] ; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE14]], [[REVERSE15]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]] -; VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]] -; VF4IC4-NEXT: [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP21]] -; VF4IC4-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]]) -; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008 +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw 
i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP21]]) +; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 ; VF4IC4-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; VF4IC4-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; VF4IC4: middle.split: @@ -697,25 +506,12 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true) -; VF4IC4-NEXT: [[TMP42:%.*]] = add i64 12, [[TMP41]] -; VF4IC4-NEXT: [[TMP43:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true) -; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 8, [[TMP43]] -; VF4IC4-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP43]], 4 -; VF4IC4-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i64 [[TMP32]], i64 [[TMP42]] -; VF4IC4-NEXT: [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) -; VF4IC4-NEXT: [[TMP36:%.*]] = add i64 4, [[TMP34]] -; VF4IC4-NEXT: [[TMP35:%.*]] = icmp ne i64 [[TMP34]], 4 -; VF4IC4-NEXT: [[TMP37:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP33]] -; VF4IC4-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) -; VF4IC4-NEXT: [[TMP40:%.*]] = add i64 0, [[TMP38]] -; VF4IC4-NEXT: [[TMP39:%.*]] = icmp ne i64 [[TMP38]], 4 -; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 [[TMP39]], i64 [[TMP40]], i64 [[TMP37]] +; VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true) ; VF4IC4-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] ; VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 1023, [[TMP11]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 15, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] 
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: ; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -774,31 +570,13 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 -; VF4IC4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 4 -; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 8 -; VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 12 -; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP29]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; VF4IC4-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD4]] -; VF4IC4-NEXT: 
[[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP4]], [[TMP11]] -; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]] -; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]] -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]]) +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -808,20 +586,7 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: ; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) -; VF4IC4-NEXT: [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) -; VF4IC4-NEXT: [[TMP22:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]] -; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE8]], 4 -; VF4IC4-NEXT: [[TMP23:%.*]] = select i1 [[TMP21]], i64 [[TMP22]], i64 [[TMP20]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) -; VF4IC4-NEXT: [[TMP25:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]] -; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE9]], 4 -; VF4IC4-NEXT: [[TMP26:%.*]] = 
select i1 [[TMP24]], i64 [[TMP25]], i64 [[TMP23]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) -; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE10]] -; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE10]], 4 -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP27]], i64 [[TMP28]], i64 [[TMP26]] -; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]] +; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD3]], i64 [[FIRST_ACTIVE_LANE1]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: ; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -884,48 +649,18 @@ define i8 @same_exit_block_reverse_use_loaded_value() { ; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3 -; VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4 -; VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 -3 -; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8 -; VF4IC4-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3 -; VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12 -; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 -3 -; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 -; VF4IC4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 -; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> -; 
VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP37]], align 1 -; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> +; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3 ; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 ; VF4IC4-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> ; VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; VF4IC4-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; VF4IC4-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP38]], i32 -3 -; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -4 -; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3 -; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -8 -; VF4IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3 -; VF4IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -12 -; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3 -; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP39]], align 1 -; VF4IC4-NEXT: [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD7]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 -; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 -; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> +; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP38]], i32 -3 ; VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 ; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> -; VF4IC4-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> 
[[REVERSE]], [[REVERSE8]] -; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]] -; VF4IC4-NEXT: [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]] ; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]] -; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF4IC4-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP6]], [[TMP19]] -; VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]] -; VF4IC4-NEXT: [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP21]] -; VF4IC4-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]]) -; VF4IC4-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008 +; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4IC4-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP21]]) +; VF4IC4-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 ; VF4IC4-NEXT: [[TMP27:%.*]] = or i1 [[TMP25]], [[TMP26]] ; VF4IC4-NEXT: br i1 [[TMP27]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; VF4IC4: middle.split: @@ -934,23 +669,10 @@ define i8 @same_exit_block_reverse_use_loaded_value() { ; VF4IC4-NEXT: br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; VF4IC4: vector.early.exit: ; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true) -; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE1]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true) -; VF4IC4-NEXT: [[TMP30:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]] -; VF4IC4-NEXT: [[TMP29:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE15]], 4 -; VF4IC4-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i64 [[TMP30]], i64 [[TMP28]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) -; VF4IC4-NEXT: [[TMP33:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]] -; VF4IC4-NEXT: [[TMP32:%.*]] = 
icmp ne i64 [[FIRST_ACTIVE_LANE16]], 4 -; VF4IC4-NEXT: [[TMP34:%.*]] = select i1 [[TMP32]], i64 [[TMP33]], i64 [[TMP31]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE17:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) -; VF4IC4-NEXT: [[TMP36:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE17]] -; VF4IC4-NEXT: [[TMP35:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE17]], 4 -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = select i1 [[TMP35]], i64 [[TMP36]], i64 [[TMP34]] -; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE]], i64 [[FIRST_ACTIVE_LANE]] +; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE6]], i64 [[FIRST_ACTIVE_LANE1]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 15, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: ; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/vect.stats.ll b/llvm/test/Transforms/LoopVectorize/vect.stats.ll index 9a55dc99c316b..018e2c213ddf2 100644 --- a/llvm/test/Transforms/LoopVectorize/vect.stats.ll +++ b/llvm/test/Transforms/LoopVectorize/vect.stats.ll @@ -1,12 +1,12 @@ -; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize --disable-output -stats -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization --disable-output -stats -S 2>&1 | FileCheck %s ; REQUIRES: asserts -; -; We have 2 loops, one of them is vectorizable and the second one is not. -; +; We have 3 loops, two of them are vectorizable (with one being early-exit +; vectorized) and the third one is not. 
-; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization -; CHECK: 1 loop-vectorize - Number of loops vectorized +; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of early exit loops vectorized +; CHECK: 2 loop-vectorize - Number of loops vectorized target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -31,6 +31,36 @@ for.end: ; preds = %entry, %for.body ret void } +define i32 @early_exit_vectorized(i64 %end) { +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i64 %end, 1023 + br label %for.body + +for.body: + %ind = phi i64 [ %ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i64 %ind, 1 + %cmp = icmp ult i64 %ind.next, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + define void @not_vectorized(ptr nocapture %a, i64 %size) { entry: %cmp1 = icmp sle i64 %size, 0 @@ -56,3 +86,5 @@ for.body: ; preds = %entry, %for.body for.end: ; preds = %entry, %for.body ret void } + +declare void @init_mem(ptr, i64); diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index 2f192ad7a9ea9..fdd5e0e7958ec 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -55,17 +55,18 @@ 
define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0 -; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8 -; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer ; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer -; VF8UF2-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP3]], [[TMP6]] -; VF8UF2-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]]) -; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]] +; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF2-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]]) +; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; VF8UF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF8UF2: [[MIDDLE_SPLIT]]: -; VF8UF2-NEXT: br i1 [[TMP5]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF2-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: @@ -82,7 +83,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2: [[LOOP_LATCH]]: ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 ; VF8UF2-NEXT: 
[[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] ; VF8UF2-NEXT: ret i8 [[RES]] @@ -192,27 +193,23 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0 -; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8 -; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer ; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer -; VF8UF2-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP3]], [[TMP6]] -; VF8UF2-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]]) -; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]] +; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF2-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]]) +; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; VF8UF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF8UF2: [[MIDDLE_SPLIT]]: -; VF8UF2-NEXT: br i1 [[TMP7]], label 
%[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF2-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: ; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP6]], i1 true) -; VF8UF2-NEXT: [[TMP10:%.*]] = add i64 8, [[TMP8]] -; VF8UF2-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true) -; VF8UF2-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] -; VF8UF2-NEXT: [[TMP9:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE]], 8 -; VF8UF2-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP5]], i64 [[TMP10]] -; VF8UF2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]] +; VF8UF2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF8UF2-NEXT: br label %[[EXIT]] ; VF8UF2: [[SCALAR_PH]]: ; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] @@ -226,9 +223,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2: [[LOOP_LATCH]]: ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; VF8UF2: [[EXIT]]: -; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VECTOR_EARLY_EXIT]] ] +; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ] ; VF8UF2-NEXT: ret i64 [[RES]] ; ; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll 
b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll index 5cb2c4530aa57..8e25c9c5547d6 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll @@ -567,22 +567,19 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, ; ; SSE4-LABEL: @buildvector_mul_subadd_ps256( ; SSE4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] -; SSE4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] ; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = fsub <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> ; SSE4-NEXT: ret <8 x float> [[TMP6]] ; ; AVX_FMA4-LABEL: @buildvector_mul_subadd_ps256( ; AVX_FMA4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] -; AVX_FMA4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] -; AVX_FMA4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> -; AVX_FMA4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] ; AVX_FMA4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> -; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; 
AVX_FMA4-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> ; AVX_FMA4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> ; AVX_FMA4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> ; AVX_FMA4-NEXT: ret <8 x float> [[TMP6]] @@ -677,13 +674,11 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> ; ; AVX_FMA-LABEL: @buildvector_mul_subadd_ps512( ; AVX_FMA-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]] -; AVX_FMA-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]] -; AVX_FMA-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> -; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B:%.*]] ; AVX_FMA-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> -; AVX_FMA-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> -; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP5]], <16 x i32> -; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <16 x i32> +; AVX_FMA-NEXT: [[TMP5:%.*]] = fsub <16 x float> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <16 x i32> +; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x i32> ; AVX_FMA-NEXT: ret <16 x float> [[TMP7]] ; ; AVX512-LABEL: @buildvector_mul_subadd_ps512( @@ -880,13 +875,11 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> ; ; AVX_FMA-LABEL: @buildvector_mul_subadd_pd512( ; AVX_FMA-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] -; AVX_FMA-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]] -; AVX_FMA-NEXT: [[TMP2:%.*]] = 
shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> -; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B:%.*]] ; AVX_FMA-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> -; AVX_FMA-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <8 x i32> -; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP5]], <8 x i32> -; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> +; AVX_FMA-NEXT: [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <8 x i32> +; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP6]], <8 x i32> ; AVX_FMA-NEXT: ret <8 x double> [[TMP7]] ; ; AVX512-LABEL: @buildvector_mul_subadd_pd512( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll index 9f9e9d84108e6..9c615bb4757fa 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll @@ -12,9 +12,10 @@ define void @foo(ptr %0) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP6]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x ptr> @llvm.vector.insert.v8p0.v4p0(<8 x ptr> poison, <4 x ptr> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x ptr> @llvm.vector.insert.v8p0.v4p0(<8 x ptr> [[TMP11]], <4 x ptr> [[TMP5]], i64 4) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x ptr> [[TMP7]], <8 x ptr> poison, <8 x i32> +; 
CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x ptr> [[TMP12]], <8 x ptr> poison, <8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <8 x ptr> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP10]]) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll index 9327fe8995d45..8d44d03e0e5cc 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll @@ -11,7 +11,7 @@ define i32 @test(ptr %c) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v6i64(<8 x i64> poison, <6 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8> ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll index 19b6d82818532..442769937ac12 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: @@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index 7ae44c274ff6d..fcbe2d631ba8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -164,7 +164,8 @@ define i32 
@getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 6c5220d13b7a2..bb05440910130 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -420,27 +420,26 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; TODO: Dead code must be removed below. 
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP4]] to i32 ; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1 -; CHECK-NEXT: 
[[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 -; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5 ; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 ; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP6]] to i32 @@ -454,17 +453,17 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 ; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP9]] to i32 ; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6 -; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 -; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6 -; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 -; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP11]] to i32 ; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3 -; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 -; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP12]] to i32 ; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3 -; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 -; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr 
[[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP13]] to i32 ; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7 ; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 ; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP14]] to i32 @@ -478,17 +477,17 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1 ; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP17]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP18]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP19]] to i32 ; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 -; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP50]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 +; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP20]] to i32 ; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1 -; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1 +; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP21]] to i32 ; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5 ; 
CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1 ; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32 @@ -519,28 +518,35 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7 ; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1 ; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP38]], <4 x i8> [[TMP4]], i64 4) -; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP1]], i64 8) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP5]], i64 12) -; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP46:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP45]], <4 x i8> [[TMP12]], i64 4) -; CHECK-NEXT: [[TMP47:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP46]], <4 x i8> [[TMP3]], i64 8) -; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> 
@llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP47]], <4 x i8> [[TMP13]], i64 12) -; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul <16 x i32> [[TMP11]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) -; CHECK-NEXT: ret i32 [[TMP21]] +; CHECK-NEXT: [[TMP32:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP32]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP36]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP32]], <4 x i8> [[TMP36]], <16 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i8> [[TMP37]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i8> [[TMP42]], <16 x i8> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = zext <16 x i8> [[TMP44]] to <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP46]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP35]], <4 
x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i8> [[TMP50]], <16 x i8> [[TMP51]], <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <4 x i8> [[TMP47]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = zext <16 x i8> [[TMP54]] to <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = mul <16 x i32> [[TMP45]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP56]]) +; CHECK-NEXT: ret i32 [[TMP57]] ; +; TODO: Dead code must be removed below. entry: %idx.ext = sext i32 %off1 to i64 %idx.ext63 = sext i32 %off2 to i64 @@ -1016,69 +1022,68 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; TODO: Dead code must be removed below. 
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP4]] to i32 ; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1 -; CHECK-NEXT: 
[[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 -; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5 -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 -; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP6]] to i32 ; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5 -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1 -; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2 -; CHECK-NEXT: [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 -; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP8]] to i32 ; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2 -; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 -; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP9]] to i32 ; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6 -; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 -; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: 
[[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6 -; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 -; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP11]] to i32 ; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3 -; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 -; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP12]] to i32 ; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3 -; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 -; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP13]] to i32 ; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7 -; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 -; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP14]] to i32 ; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7 -; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1 -; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP15]] to i32 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr 
[[ADD_PTR_2]], align 1 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP16]] to i32 ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP17]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP18]] to i32 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP19]] to i32 ; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1 ; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1 ; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP20]] to i32 @@ -1118,32 +1123,33 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4 ; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8 ; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP1]], 
[[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i32> [[TMP16]], [[TMP18]] -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[DST0]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[DST4]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[DST8]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[DST12]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i8> [[TMP32]] to <4 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = mul <4 x i32> [[TMP33]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = zext <4 x i8> [[TMP37]] to <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP38]], [[TMP40]] +; CHECK-NEXT: 
[[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = mul <4 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = mul <4 x i32> [[TMP48]], [[TMP50]] +; CHECK-NEXT: store <4 x i32> [[TMP36]], ptr [[DST0]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP41]], ptr [[DST4]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP46]], ptr [[DST8]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP51]], ptr [[DST12]], align 4 ; CHECK-NEXT: ret void ; +; TODO: Dead code must be removed below. 
entry: %idx.ext = sext i32 %off1 to i64 %idx.ext63 = sext i32 %off2 to i64 @@ -1422,29 +1428,41 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP13]], <4 x i8> [[TMP4]], i64 4) -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP14]], <4 x i8> [[TMP8]], i64 8) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP12]], i64 12) +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP5]], i64 4) -; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> 
@llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP20]], <4 x i8> [[TMP9]], i64 8) -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP19]], i64 12) +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP30:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP29]], <4 x i8> [[TMP6]], i64 4) -; CHECK-NEXT: [[TMP28:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP30]], <4 x i8> [[TMP10]], i64 8) -; CHECK-NEXT: [[TMP32:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP28]], <4 x i8> [[TMP27]], i64 12) +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> 
[[TMP67]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP36:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP35]], <4 x i8> [[TMP7]], i64 4) -; CHECK-NEXT: [[TMP37:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP36]], <4 x i8> [[TMP11]], i64 8) -; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP37]], <4 x i8> [[TMP34]], i64 12) +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i8> [[TMP70]], <16 x i8> [[TMP71]], <16 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP72]], <16 x i8> [[TMP73]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], splat (i32 16) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index 07411cacb3626..9562e6d41f7cd 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -29,14 +29,21 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) -; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) -; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) -; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) -; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) -; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <64 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = 
shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP83]], <64 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <64 x i16> [[TMP84]], <64 x i16> [[TMP85]], <64 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP88:%.*]] = shufflevector <64 x i16> [[TMP86]], <64 x i16> [[TMP87]], <64 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP88]], <64 x i16> [[TMP89]], <64 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index 677d52bf3b4c3..0e3d79900d435 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -3,13 +3,19 @@ ; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16 define half @reduce_fast_half2(<2 x half> %vec2) { -; CHECK-LABEL: define half @reduce_fast_half2( -; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], 
[[ELT0]] -; CHECK-NEXT: ret half [[ADD1]] +; NOFP16-LABEL: define half @reduce_fast_half2( +; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] { +; NOFP16-NEXT: [[ENTRY:.*:]] +; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0 +; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1 +; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]] +; NOFP16-NEXT: ret half [[ADD1]] +; +; FULLFP16-LABEL: define half @reduce_fast_half2( +; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] { +; FULLFP16-NEXT: [[ENTRY:.*:]] +; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]]) +; FULLFP16-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <2 x half> %vec2, i64 0 @@ -20,7 +26,7 @@ entry: define half @reduce_half2(<2 x half> %vec2) { ; CHECK-LABEL: define half @reduce_half2( -; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0 ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1 @@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) { ; CHECK-LABEL: define float @reduce_fast_float2( ; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]] +; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]]) ; CHECK-NEXT: ret float [[ADD1]] ; entry: @@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) { ; CHECK-LABEL: define double @reduce_fast_double2( ; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT0:%.*]] = 
extractelement <2 x double> [[VEC2]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]] +; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]]) ; CHECK-NEXT: ret double [[ADD1]] ; entry: @@ -552,8 +554,9 @@ define float @reduce_fast_float_case2(ptr %a, ptr %b) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP0]], <8 x i32> ; CHECK-NEXT: [[RED3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: ret float [[RED3]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index a504f3ed02014..64bdcf28af550 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -15,7 +15,8 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[ARG3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> 
@llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP2]], <2 x float> [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index 4f88182374622..0783a28f56d85 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -13,7 +13,8 @@ define void @p(double %0) { ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 2191d04cd797d..833bc56c4ec6b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -7,7 +7,8 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POWER-OF-2-NEXT: entry: ; 
NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) +; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> ; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POWER-OF-2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index 61a944101586b..c728572313d77 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -253,13 +253,14 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 
[[L_11]], i32 11 -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP10]], <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], splat (i8 -1) ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index cd79250e8fb6b..b772e4be3b0aa 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -7,8 +7,9 @@ define void @test(ptr noalias %p, ptr %p1) { ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP3]], <2 x i16> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> [[TMP2]], <4 x i32> ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; 
CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 15425c38bbb04..5ee9f3ca46ca8 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -88,7 +88,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP115]], i32 0 ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2) +; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP117]], <4 x i32> ; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> ; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> @@ -112,7 +113,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] ; CHECK-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4) +; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <4 x i32> [[TMP91]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = shufflevector <8 x i32> [[TMP93]], <8 x i32> [[TMP118]], <8 x i32> ; CHECK-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]] ; CHECK-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]] ; CHECK-NEXT: 
[[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> @@ -220,7 +222,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> ; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 ; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 -; THR15-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2) +; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> +; THR15-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP116]], <4 x i32> ; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> ; THR15-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; THR15-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> @@ -244,7 +247,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]] ; THR15-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]] ; THR15-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> -; THR15-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4) +; THR15-NEXT: [[TMP117:%.*]] = shufflevector <4 x i32> [[TMP91]], <4 x i32> poison, <8 x i32> +; THR15-NEXT: [[TMP94:%.*]] = shufflevector <8 x i32> [[TMP93]], <8 x i32> [[TMP117]], <8 x i32> ; THR15-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]] ; THR15-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]] ; THR15-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll index 
cc88718484172..82c940353ba5a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll @@ -14,8 +14,9 @@ define i16 @test(ptr %i) { ; CHECK: [[FOR_COND5_US]]: ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison) -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP6]], <4 x i16> [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.umax.i16(i16 [[TMP8]], i16 0) ; CHECK-NEXT: ret i16 [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll index 9269a710c61d3..8e80aee7070a9 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll @@ -11,11 +11,12 @@ define void @test(ptr %c) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> splat (i1 true), <8 x i8> poison) ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> 
[[TMP3]], i32 1, <8 x i1> splat (i1 true), <8 x i8> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: br label %[[FOR_COND:.*]] ; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[A_PROMOTED2226:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FOR_COND]] ] -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP5]], i64 8) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP8]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP7]]) ; CHECK-NEXT: br label %[[FOR_COND]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll index 78b5acad0df9a..457f2600b539f 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll @@ -45,12 +45,14 @@ define float @test(ptr %x) { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call fast <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <8 x float> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> [[RDX_OP]], i64 0) -; CHECK-NEXT: [[RDX_OP4:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v16f32(<16 x float> [[TMP6]], i64 0) +; CHECK-NEXT: 
[[TMP6:%.*]] = shufflevector <8 x float> [[RDX_OP]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[RDX_OP4:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> poison, <4 x i32> ; CHECK-NEXT: [[RDX_OP5:%.*]] = fadd fast <4 x float> [[RDX_OP4]], [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call fast <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> [[RDX_OP5]], i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[RDX_OP5]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP9]], <16 x i32> ; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP8]]) ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll index 1e7cc9c268cfa..b6a40f0162bbd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll @@ -12,7 +12,8 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) { ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP8]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> 
[[TMP5]], [[TMP6]] ; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8 ; CHECK-NEXT: store double [[TMP2]], ptr [[PHASES_IN]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index db09843a6ef72..5bc2e94485432 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1027,8 +1027,9 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[TMP5]] @@ -1075,8 +1076,9 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector 
<4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[TMP5]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll index 258b0ec0bcfc7..f6e4643006816 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll @@ -17,12 +17,13 @@ define void @test1(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, < ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> ; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[VEXT165_I]], i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP0]], <4 x float> [[VEXT309_I]], i64 4) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[LOAD17:%.*]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[FMULADD7:%.*]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP5]], <4 x float> [[FMULADD16:%.*]], i64 4) +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[VEXT165_I]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[VEXT309_I]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[LOAD17:%.*]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[FMULADD7:%.*]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[FMULADD16:%.*]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[TMP5]], <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[TMP6]]) ; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4 ; CHECK-NEXT: ret void @@ -55,12 +56,13 @@ define void @test2(<8 x float> %load6, <8 x float> %load7, <8 x float> %load8, < ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VEXT165_I:%.*]] = shufflevector <8 x float> [[LOAD6:%.*]], <8 x float> [[LOAD7:%.*]], <8 x i32> ; CHECK-NEXT: [[VEXT309_I:%.*]] = shufflevector <8 x float> [[LOAD7]], <8 x float> [[LOAD8:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> [[VEXT165_I]], i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> [[VEXT309_I]], i64 8) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> [[LOAD17:%.*]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> [[FMULADD7:%.*]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP5]], <8 x float> [[FMULADD16:%.*]], i64 8) +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[VEXT165_I]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> 
[[VEXT309_I]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[LOAD17:%.*]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[FMULADD7:%.*]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[FMULADD16:%.*]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP4]], <16 x float> [[TMP6]]) ; CHECK-NEXT: store <16 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll index 651f565412830..da08718d5c248 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -8,8 +8,7 @@ define i32 @test() { ; CHECK-NEXT: br label [[IF_END_I87:%.*]] ; CHECK: if.end.i87: ; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> , <4 x i64> ), i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> ; CHECK-NEXT: switch i32 0, label [[SW_BB509_I:%.*]] [ ; CHECK-NEXT: i32 1, label [[SW_BB509_I]] ; CHECK-NEXT: i32 0, label [[IF_THEN458_I:%.*]] @@ -51,21 +50,15 @@ define void @test2() { ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: 
[[TMP5:%.*]] = load <16 x float>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP6]], <8 x float> [[TMP3]], i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v16f32(<32 x float> [[TMP7]], <16 x float> [[TMP5]], i64 16) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <32 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <32 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x float> [[TMP6]], <32 x float> [[TMP7]], <32 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x float> [[TMP10]], <32 x float> [[TMP11]], <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = fpext <32 x float> [[TMP8]] to <32 x double> -; CHECK-NEXT: [[TMP10:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> poison, <8 x double> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP10]], <8 x double> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP12:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP11]], <8 x double> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP13:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP12]], <8 x double> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP14:%.*]] = fadd <32 x double> [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <32 x double> zeroinitializer, [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = fptrunc <32 x double> [[TMP14]] to <32 x float> -; CHECK-NEXT: [[TMP16:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP17:%.*]] 
= call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP16]], <8 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP18:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP17]], <8 x float> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP19:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP18]], <8 x float> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP20:%.*]] = fcmp ogt <32 x float> [[TMP19]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = fcmp ogt <32 x float> zeroinitializer, [[TMP15]] ; CHECK-NEXT: ret void ; entry: @@ -101,20 +94,17 @@ define void @test3(float %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY_LR_PH:%.*]] ; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP1]], <2 x float> zeroinitializer, i64 2) ; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x float> [ zeroinitializer, [[FOR_BODY_LR_PH]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr null, align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x float> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> poison, <2 x i1> splat (i1 true), i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> [[TMP6]], <2 x i1> [[TMP5]], i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; 
CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i1> [[TMP5]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i1> , <4 x i1> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP9]], <4 x float> [[TMP6]], <4 x float> zeroinitializer ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; entry: @@ -142,24 +132,25 @@ define ptr @test4() { ; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> ; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> ; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) -; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) +; POWEROF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; POWEROF2-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; POWEROF2-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP16]], <4 x i32> ; POWEROF2-NEXT: br label [[TMP8:%.*]] -; POWEROF2: 7: -; POWEROF2-NEXT: br label [[TMP8]] ; POWEROF2: 8: +; POWEROF2-NEXT: br label [[TMP8]] +; POWEROF2: 9: ; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ] ; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ] ; POWEROF2-NEXT: br label [[TMP11:%.*]] -; POWEROF2: 11: -; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0) +; POWEROF2: 12: +; POWEROF2-NEXT: [[TMP12:%.*]] = 
shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> ; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2) +; POWEROF2-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> ; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]] -; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]] -; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 -; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00 +; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]] +; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00 ; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]] ; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 @@ -176,18 +167,19 @@ define ptr @test4() { ; NONPOWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer ; NONPOWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> ; NONPOWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> -; NONPOWEROF2-NEXT: [[TMP4:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> poison, <3 x float> [[TMP2]], i64 0) -; NONPOWEROF2-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> [[TMP4]], <3 x float> [[TMP3]], i64 3) +; NONPOWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <6 x i32> +; NONPOWEROF2-NEXT: [[TMP18:%.*]] = 
shufflevector <3 x float> [[TMP3]], <3 x float> poison, <6 x i32> +; NONPOWEROF2-NEXT: [[TMP5:%.*]] = shufflevector <6 x float> [[TMP4]], <6 x float> [[TMP18]], <6 x i32> ; NONPOWEROF2-NEXT: br label [[TMP7:%.*]] -; NONPOWEROF2: 6: -; NONPOWEROF2-NEXT: br label [[TMP7]] ; NONPOWEROF2: 7: +; NONPOWEROF2-NEXT: br label [[TMP7]] +; NONPOWEROF2: 8: ; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ] ; NONPOWEROF2-NEXT: br label [[TMP9:%.*]] -; NONPOWEROF2: 9: -; NONPOWEROF2-NEXT: [[TMP10:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 0) +; NONPOWEROF2: 10: +; NONPOWEROF2-NEXT: [[TMP10:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> ; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]] -; NONPOWEROF2-NEXT: [[TMP12:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 3) +; NONPOWEROF2-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> ; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]] ; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]]) ; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]]) @@ -235,21 +227,9 @@ define ptr @test4() { define i32 @test5() { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> [[TMP0]], <2 x double> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <4 x double> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> poison, <2 x double> zeroinitializer, i64 0) -; 
CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP3]], <2 x double> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP4]], <2 x double> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP5]], <2 x double> zeroinitializer, i64 6) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> poison, <2 x double> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP7]], <2 x double> zeroinitializer, i64 6) -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> poison, <4 x double> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x double> [[TMP6]], [[TMP10]] ; CHECK-NEXT: br label [[FOR_END47:%.*]] ; CHECK: for.end47: -; CHECK-NEXT: [[TMP12:%.*]] = phi <8 x double> [ [[TMP11]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x double> [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll index 4dd659a7ae802..510cf45edbb52 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll @@ -20,10 +20,10 @@ define void @test(ptr %mdct_forward_x) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison) ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> -; CHECK-NEXT: 
[[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP22]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 481d586e6658a..27de36e601512 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) { ; ; POW2-ONLY-LABEL: @dot_product_i32( ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 -; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 -; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 ; 
POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] -; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]] +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]] ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]]) ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] ; POW2-ONLY-NEXT: ret i32 [[ADD_1]] ; @@ -568,21 +563,16 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { ; ; POW2-ONLY-LABEL: @dot_product_i32_reorder( ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1 -; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1 -; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]] -; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], 
[[L_B_1]] +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]] ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]]) ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]] ; POW2-ONLY-NEXT: ret i32 [[ADD_1]] ; @@ -630,9 +620,7 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] ; POW2-ONLY-NEXT: ret float [[ADD_1]] ; @@ -682,9 +670,7 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) { ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast 
float [[ADD_0]], [[MUL_2]] ; POW2-ONLY-NEXT: ret float [[ADD_1]] ; @@ -733,9 +719,7 @@ define double @dot_product_fp64(ptr %a, ptr %b) { ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]]) ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] ; POW2-ONLY-NEXT: ret double [[ADD_1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index 9e6270376ddd4..0d1de729bf18c 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -13,7 +13,8 @@ define void @foo() { ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP7]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], splat (i32 6) diff --git 
a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll index 5681fb7346124..dbeff25954085 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll @@ -16,8 +16,10 @@ define void @test(i32 %0, i64 %1, i32 %2, i32 %3, ptr %4) { ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <60 x i32> [[TMP14]], i32 [[TMP98]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <60 x i32> [[TMP15]], i32 [[TMP73]], i32 6 ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP17]], <2 x i32> [[TMP8]], i64 2) -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP18]], <2 x i32> [[TMP8]], i64 4) +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <60 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> [[TMP22]], <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> [[TMP18]], <8 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP19]], <8 x i32> poison, <60 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = xor <60 x i32> [[TMP12]], [[TMP20]] ; CHECK-NEXT: [[TMP130:%.*]] = call i32 @llvm.vector.reduce.or.v60i32(<60 x i32> [[TMP21]]) diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll index 1dd6c7b81fb73..3f4436f33fad6 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll @@ -25,23 +25,19 @@ define void @e(<4 x i16> %0) { ; ; THRESH-LABEL: @e( ; 
THRESH-NEXT: entry: -; THRESH-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0) -; THRESH-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP1]], <4 x i16> zeroinitializer, i64 4) -; THRESH-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 4) -; THRESH-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0) -; THRESH-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4) -; THRESH-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP5]], <4 x i16> zeroinitializer, i64 8) -; THRESH-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 12) ; THRESH-NEXT: br label [[VECTOR_BODY:%.*]] ; THRESH: vector.body: ; THRESH-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[VECTOR_BODY]] ] ; THRESH-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] -; THRESH-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP3]], <4 x i16> [[VEC_IND]], i64 0) -; THRESH-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP2]], [[TMP8]] +; THRESH-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[VEC_IND]], <4 x i16> poison, <8 x i32> +; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> , <8 x i16> [[TMP1]], <8 x i32> +; THRESH-NEXT: [[TMP3:%.*]] = add <8 x i16> zeroinitializer, [[TMP8]] ; THRESH-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> poison, <16 x i32> -; THRESH-NEXT: [[TMP11:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP10]], <4 x i16> [[TMP0:%.*]], i64 4) -; THRESH-NEXT: [[TMP12:%.*]] = call <16 x i16> 
@llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP11]], <8 x i16> [[TMP9]], i64 8) -; THRESH-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP12]], [[TMP7]] +; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0:%.*]], <4 x i16> poison, <16 x i32> +; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[TMP5]], <16 x i32> +; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <16 x i32> +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[TMP7]], <16 x i32> +; THRESH-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP9]], zeroinitializer ; THRESH-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> ; THRESH-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) ; THRESH-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll index 65e5458b25d2f..6be51062f6fa1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -13,7 +13,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -22,7 +23,8 @@ define <8 x float> @sitofp_uitofp(<8 x 
i32> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @sitofp_uitofp( @@ -77,7 +79,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @fptosi_fptoui( @@ -86,7 +89,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; 
AVX-LABEL: @fptosi_fptoui( @@ -143,7 +147,8 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { ; SSE2-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) ; SSE2-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) ; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP9]], <8 x i32> ; SSE2-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] ; @@ -155,7 +160,8 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { ; SLM-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) ; SLM-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP9]], <8 x i32> ; SLM-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] ; @@ -233,7 +239,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x 
i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @sext_zext( @@ -242,7 +249,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @sext_zext( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll index fad46870ec475..1db428706047a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -13,7 +13,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x float> [[TMP5]] ; ; SLM-LABEL: @sitofp_uitofp( @@ -22,7 +23,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 
x i32> ; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; ; AVX-LABEL: @sitofp_uitofp( @@ -77,7 +79,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @fptosi_fptoui( @@ -86,7 +89,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @fptosi_fptoui( @@ -143,7 +147,8 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { 
; SSE2-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) ; SSE2-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) ; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP9]], <8 x i32> ; SSE2-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]] ; @@ -155,7 +160,8 @@ define <8 x float> @fneg_fabs(<8 x float> %a) { ; SLM-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648) ; SLM-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647) ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP9]], <8 x i32> ; SLM-NEXT: [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float> ; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]] ; @@ -233,7 +239,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> 
[[TMP7]], <8 x i32> ; SSE2-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @sext_zext( @@ -242,7 +249,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX-LABEL: @sext_zext( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll index 99b13bdc05082..06498563a7d37 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -15,7 +15,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; @@ -27,7 +28,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; SLM-NEXT: [[TMP4:%.*]] = 
shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; @@ -39,7 +41,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; @@ -99,7 +102,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> 
[[TMP10]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; @@ -111,7 +115,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; @@ -123,7 +128,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; @@ -135,7 +141,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x 
float> [[TMP3]], <8 x float> poison, <4 x i32> ; AVX2-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX2-NEXT: ret <8 x float> [[TMP5]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll index 7f9475917b566..6275d984295c0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -15,7 +15,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; @@ -27,7 +28,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] ; 
SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; @@ -39,7 +41,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]] ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; @@ -99,7 +102,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SSE-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; 
SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SSE-NEXT: ret <8 x float> [[TMP5]] ; @@ -111,7 +115,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; SLM-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; SLM-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; SLM-NEXT: ret <8 x float> [[TMP5]] ; @@ -123,7 +128,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX-NEXT: ret <8 x float> [[TMP5]] ; @@ -135,7 +141,8 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> ; AVX2-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]] ; AVX2-NEXT: [[TMP7:%.*]] = 
shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4) +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP12]], <8 x i32> ; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; AVX2-NEXT: ret <8 x float> [[TMP5]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll index 11ab7770a5383..d02df1ac92b4d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -15,7 +15,8 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @add_sub_v8i32( @@ -26,7 +27,8 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; 
SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @add_sub_v8i32( @@ -143,7 +145,8 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( @@ -154,7 +157,8 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( @@ -217,7 +221,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> 
@llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( @@ -226,7 +231,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( @@ -592,7 +598,8 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @add_sub_v8i32_splat( @@ -603,7 +610,8 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] ; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> 
[[TMP4]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX1-LABEL: @add_sub_v8i32_splat( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll index 9589ec24d49d4..d9a7586ecd23d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -15,7 +15,8 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @add_sub_v8i32( @@ -26,7 +27,8 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> 
[[TMP10]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @add_sub_v8i32( @@ -143,7 +145,8 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32( @@ -154,7 +157,8 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]] ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4) +; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP10]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32( @@ -217,7 +221,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> 
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP5]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( @@ -226,7 +231,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3) ; SLM-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4) +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP5]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( @@ -592,7 +598,8 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[TMP7]] ; ; SLM-LABEL: @add_sub_v8i32_splat( @@ -603,7 +610,8 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] ; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> -; SLM-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4) +; 
SLM-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[TMP7]] ; ; AVX1-LABEL: @add_sub_v8i32_splat( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll index f2992cf044cd5..e1ee35217d187 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -40,9 +40,10 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { ; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 ; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 ; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP3]], i64 0) +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> ; SSE-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP3]], <4 x i64> [[RDX_OP]], i64 0) +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP12]], <8 x i32> ; SSE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) ; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP16]], [[OR_13]] ; SSE-NEXT: [[OP_RDX5:%.*]] = or i64 [[OR_14]], [[OR_15]] @@ -75,9 +76,10 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { ; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 ; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 ; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP3]], i64 0) +; AVX-NEXT: [[TMP10:%.*]] = 
shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> ; AVX-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; AVX-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP3]], <4 x i64> [[RDX_OP]], i64 0) +; AVX-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP13]], <8 x i32> ; AVX-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) ; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] @@ -110,9 +112,10 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { ; AVX512-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 ; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 ; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX512-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP3]], i64 0) +; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> ; AVX512-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; AVX512-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP3]], <4 x i64> [[RDX_OP]], i64 0) +; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP13]], <8 x i32> ; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) ; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-reused-with-bv-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-reused-with-bv-subvector.ll index 5d2f059a8cf41..ff0887cf12447 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-reused-with-bv-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-reused-with-bv-subvector.ll @@ -16,7 +16,8 @@ define void @test(ptr %0, i64 %1, i64 %2) { ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i64> poison, i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i64> [[TMP9]], i64 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP11]], <4 x i64> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP16]], <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP8]] ; CHECK-NEXT: br [[DOTPREHEADER_US_US:label %.*]] ; CHECK: [[_PREHEADER_US_US:.*:]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 7ed5f33c9dc6c..07fdc9d8dd2fa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -17,7 +17,8 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> ; CHECK-NEXT: ret void ; bb: diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll index fa46bd3d83249..c8748f316f024 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll @@ -24,7 +24,8 @@ define void @test(ptr %0, i32 %add651) { ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[ADD651]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP13]], <2 x i32> [[TMP10]], i64 2) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP19]], <4 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP14]], splat (i32 1) ; CHECK-NEXT: [[SHR685:%.*]] = lshr i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i16> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index f16c879c451c2..4a8af6d03da06 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX ; ; dot4(ptr x, ptr y) - ((xptr y[0])+(xptr y[1])+(xptr y[2])+(xptr y[3])) @@ -347,14 +347,30 @@ define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt } define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; CHECK-LABEL: @dot2f64_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] -; CHECK-NEXT: ret double [[DOT01]] +; SSE2-LABEL: @dot2f64_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]]) +; SSE2-NEXT: ret double [[TMP4]] +; +; SSE4-LABEL: @dot2f64_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE4-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] +; SSE4-NEXT: 
ret double [[DOT01]] +; +; AVX-LABEL: @dot2f64_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] +; AVX-NEXT: ret double [[DOT01]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -369,14 +385,30 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1 } define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; CHECK-LABEL: @dot2f32_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] -; CHECK-NEXT: ret float [[DOT01]] +; SSE2-LABEL: @dot2f32_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) +; SSE2-NEXT: ret float [[TMP4]] +; +; SSE4-LABEL: @dot2f32_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x 
float> [[TMP3]], i32 0 +; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE4-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; SSE4-NEXT: ret float [[DOT01]] +; +; AVX-LABEL: @dot2f32_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; AVX-NEXT: ret float [[DOT01]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll index 9d48e7f8a787a..bfb623ac5a9b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll @@ -34,7 +34,8 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) { ; CHECK-NEXT: [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> , <2 x float> [[TMP22]], i64 2) +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP28]], <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]] ; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[CALL25]], align 4 ; CHECK-NEXT: ret void diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll index 55fe7d6ed52e5..77585965d68e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll @@ -16,7 +16,8 @@ define i32 @test() { ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> , <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt <2 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> , <2 x i1> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i1> , <8 x i1> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> zeroinitializer, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i32> [[TMP5]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i8> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll index 20d7ba99fd515..3bf73034a1718 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll @@ -17,7 +17,8 @@ define i32 @test(ptr %c, i16 %a, i16 %0) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i16 [[A]], -2 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP11]], <4 x i1> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x 
i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP17]], <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = freeze <8 x i1> [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP13]]) ; CHECK-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll index 0e08ef4d74308..18e03df0fbcc9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -10,7 +10,7 @@ define i32 @test() { ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> @@ -19,9 +19,10 @@ define i32 @test() { ; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 ; CHECK-NEXT: [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = or <8 x i32> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = 
call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP8]], <8 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[RDX_OP]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP18]], <16 x i32> ; CHECK-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP17]]) ; CHECK-NEXT: ret i32 [[OP_RDX]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll index 992909fb3e87f..15ba98f90f0b8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll @@ -15,8 +15,9 @@ define i32 @test() { ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v12i32(<16 x i32> poison, <12 x i32> [[TMP3]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <12 x i32> [[TMP3]], <12 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP8]], <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll index 2a54ae9a1e749..ce65f532e0b3b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -11,6 +11,7 @@ define i32 @test(i64 %l.549) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[L_549]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: br label %[[IF_THEN19:.*]] ; CHECK: [[P:.*]]: ; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] @@ -18,20 +19,21 @@ define i32 @test(i64 %l.549) { ; CHECK-NEXT: br i1 false, label %[[S:.*]], label %[[Q:.*]] ; CHECK: [[Q]]: ; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP18]], <4 x i32> ; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]] ; CHECK: [[LOR_LHS_FALSE]]: ; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]] ; CHECK: [[R]]: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i64> [ [[TMP7]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi 
<4 x i64> [ [[TMP19]], %[[Q]] ], [ [[TMP20:%.*]], %[[IF_THEN19]] ] ; CHECK-NEXT: br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]] ; CHECK: [[LAND_LHS_TRUE]]: -; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i64> [ [[TMP18]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i64> [ [[TMP21]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] ; CHECK-NEXT: br i1 false, label %[[Q]], label %[[S]] ; CHECK: [[S]]: -; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP19]], %[[LAND_LHS_TRUE]] ], [ [[TMP18]], %[[R]] ], [ [[TMP7]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP22]], %[[LAND_LHS_TRUE]] ], [ [[TMP21]], %[[R]] ], [ [[TMP19]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> ; CHECK-NEXT: br label %[[IF_THEN19]] ; CHECK: [[IF_THEN19]]: @@ -39,7 +41,7 @@ define i32 @test(i64 %l.549) { ; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP20]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> [[TMP6]], <4 x i32> ; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]] ; CHECK: [[IF_END25]]: ; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index 19c29be1ef384..4f62a8d24387f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -18,7 +18,8 
@@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP9]], <2 x float> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 2484a2d2193fc..eaa77d74f8df1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -605,9 +605,10 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[RDX_OP2:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v24f32(<24 x float> [[TMP0]], i64 0) +; CHECK-NEXT: [[RDX_OP2:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> poison, <4 x i32> ; CHECK-NEXT: [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = call fast <24 x float> @llvm.vector.insert.v24f32.v4f32(<24 x float> [[TMP0]], <4 x float> [[RDX_OP3]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RDX_OP3]], <4 x 
float> poison, <24 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> [[TMP6]], <24 x i32> ; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] @@ -623,9 +624,10 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[RDX_OP2:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v24f32(<24 x float> [[TMP0]], i64 0) +; THRESHOLD-NEXT: [[RDX_OP2:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> poison, <4 x i32> ; THRESHOLD-NEXT: [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast <24 x float> @llvm.vector.insert.v24f32.v4f32(<24 x float> [[TMP0]], <4 x float> [[RDX_OP3]], i64 0) +; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RDX_OP3]], <4 x float> poison, <24 x i32> +; THRESHOLD-NEXT: [[TMP5:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> [[TMP6]], <24 x i32> ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]]) ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] ; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index ca662b838938f..b7bd3e41b0d29 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -977,9 +977,12 @@ define i32 
@maxi8_wrong_parent(i32) { ; SSE4: pp: ; SSE4-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 ; SSE4-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; SSE4-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 0) -; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP8]], i64 4) -; SSE4-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6) +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> +; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; SSE4-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> +; SSE4-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP9]], <8 x i32> ; SSE4-NEXT: [[OP_RDX7:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) ; SSE4-NEXT: ret i32 [[OP_RDX7]] ; @@ -989,8 +992,9 @@ define i32 @maxi8_wrong_parent(i32) { ; AVX: pp: ; AVX-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 ; AVX-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 0) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 2) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x 
i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP2]], <4 x i32> ; AVX-NEXT: [[RDX_OP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[TMP6]] ; AVX-NEXT: [[RDX_OP1:%.*]] = select <4 x i1> [[RDX_OP]], <4 x i32> [[TMP4]], <4 x i32> [[TMP6]] ; AVX-NEXT: [[OP_RDX7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_OP1]]) @@ -1002,9 +1006,12 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH: pp: ; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 ; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0) -; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4) -; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6) +; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <8 x i32> +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP9]], <8 x i32> ; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) ; THRESH-NEXT: ret i32 [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll index d6f0b7692bdd9..f07424f0d2934 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll @@ -20,8 +20,10 @@ define i32 @test() { ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v24i32(<64 x i32> [[TMP16]], <24 x i32> [[TMP6]], i64 24) -; CHECK-NEXT: [[TMP18:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> [[TMP17]], <16 x i32> [[TMP4]], i64 16) +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll index 80b62c3cfffac..0fddb7322e9b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll @@ -15,14 +15,17 @@ define <16 x double> @test(ptr %x, double %v, double %a) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[V]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = 
shufflevector <2 x double> [[TMP8]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v6f64(<16 x double> poison, <6 x double> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP12]], <2 x double> [[TMP6]], i64 6) -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP13]], <2 x double> [[TMP7]], i64 8) -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP14]], <2 x double> [[TMP9]], i64 10) -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP15]], <2 x double> [[TMP9]], i64 12) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP16]], <2 x double> [[TMP9]], i64 14) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x double> [[TMP12]], <16 x double> [[TMP13]], <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x double> [[TMP14]], <16 x double> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x double> [[TMP16]], <16 x double> [[TMP20]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x double> 
[[TMP21]], <16 x double> [[TMP20]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x double> [[TMP19]], <16 x double> [[TMP20]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fadd <16 x double> [[TMP5]], [[TMP17]] ; CHECK-NEXT: ret <16 x double> [[TMP18]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index 54c950a078502..48b657e8bf6e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -11,7 +11,8 @@ define void @inst_size(ptr %a, <2 x i64> %b) { ; CHECK-NEXT: [[TMPL4:%.*]] = load i64, ptr [[PTR4]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMPL1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP2]], <2 x i64> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] ; CHECK-NEXT: br label [[BLOCK:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll index d6552adbd4abf..6c729d17c1a9b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing-pad-for-split-node.ll @@ -29,14 +29,15 @@ define void @test(i32 %arg) personality ptr null { ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[PHI6]], i32 2 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[PHI7]], i32 3 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, 
<8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP7]], i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> ; CHECK-NEXT: br label %[[BB11:.*]] ; CHECK: [[BB9:.*]]: ; CHECK-NEXT: [[LANDINGPAD10:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label %[[BB11]] ; CHECK: [[BB11]]: -; CHECK-NEXT: [[TMP10:%.*]] = phi <8 x i32> [ poison, %[[BB9]] ], [ [[TMP9]], %[[BB5]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi <8 x i32> [ poison, %[[BB9]] ], [ [[TMP10]], %[[BB5]] ] ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll index ccb7e9b514cf1..842bd6c6bec37 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll @@ -108,9 +108,10 @@ define i64 @test_3() #0 { ; CHECK-NEXT: [[VAL4:%.*]] = extractelement <28 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <32 x i32> poison, i32 [[VAL4]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <28 x i32> @llvm.vector.extract.v28i32.v32i32(<32 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <28 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = mul <28 x i32> [[TMP5]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v28i32(<32 x i32> [[TMP1]], <28 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <28 x i32> [[RDX_OP]], <28 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> [[TMP7]], <32 x i32> ; CHECK-NEXT: [[OP_RDX27:%.*]] = call i32 
@llvm.vector.reduce.mul.v32i32(<32 x i32> [[TMP6]]) ; CHECK-NEXT: [[VAL64:%.*]] = add i32 3, [[OP_RDX27]] ; CHECK-NEXT: [[VAL65:%.*]] = sext i32 [[VAL64]] to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll index 289c6002851d7..f56af934f19f5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll @@ -22,9 +22,12 @@ define i32 @test(i32 %s.0) { ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP9]], <2 x i32> [[TMP3]], i64 4) -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP10]], <2 x i32> [[TMP5]], i64 6) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <8 x i32> [[TMP27]], <8 x i32> [[TMP30]], <8 x i32> ; CHECK-NEXT: br i1 false, label %[[IF_END24:.*]], label %[[IF_THEN11:.*]] ; CHECK: [[IF_THEN11]]: ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> @@ -37,11 +40,11 @@ define i32 @test(i32 %s.0) { ; CHECK: 
[[IF_THEN18:.*]]: ; CHECK-NEXT: br label %[[T]] ; CHECK: [[T]]: -; CHECK-NEXT: [[TMP30:%.*]] = phi <8 x i32> [ [[TMP27:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi <8 x i32> [ [[TMP33:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] ; CHECK-NEXT: [[TMP17]] = extractelement <4 x i32> [[TMP23:%.*]], i32 0 ; CHECK-NEXT: br i1 false, label %[[IF_END24]], label %[[K]] ; CHECK: [[IF_END24]]: -; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP11]], %[[IF_END6]] ], [ [[TMP30]], %[[T]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP31]], %[[IF_END6]] ], [ [[TMP34]], %[[T]] ] ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> @@ -52,7 +55,8 @@ define i32 @test(i32 %s.0) { ; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP21]], %[[IF_END24]] ] ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP25]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP27]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP26]], <4 x i32> [[TMP23]], i64 4) +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP33]] = shufflevector <8 x i32> [[TMP26]], <8 x i32> [[TMP32]], <8 x i32> ; CHECK-NEXT: [[TMP28]] = extractelement <4 x i32> [[TMP24]], i32 3 ; CHECK-NEXT: br i1 false, label %[[T]], label %[[IF_END6]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll index ea497c95d4114..1abc8102dc332 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -22,7 +22,8 @@ define i32 @bar() local_unnamed_addr { ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> , <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP18]], <16 x i32> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15) ; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537) ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 76104efc1bb78..6da0ecef5cd96 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -8,7 +8,7 @@ define void @test(i64 %d.promoted.i) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> zeroinitializer, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i1> @llvm.vector.insert.v16i1.v2i1(<16 x i1> poison, <2 x i1> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> [[TMP4]], <16 x i1> , <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> 
[[TMP6]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll index f7d78be4f13ca..a9f2ed61d9ee4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll @@ -18,8 +18,8 @@ define i64 @test() { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <32 x i32> , i32 0, i32 6 -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP1]], <4 x i32> , i64 24) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> , <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> , <32 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sub <32 x i32> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP4]], 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll index e9a65bf6d6f0d..7df97492b874b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll @@ -10,7 +10,8 @@ define i1 @foo() { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> , <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <8 x i32> 
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> , <8 x i1> [[TMP7]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-2-num-elems-reused.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-2-num-elems-reused.ll index 4ad02d47fb385..f1bd3384f0488 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-2-num-elems-reused.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-2-num-elems-reused.ll @@ -6,7 +6,7 @@ define i64 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[OR54_I_I_6:%.*]] = or i32 0, 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[OR54_I_I_6]], i32 8 -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> , <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i32> [[TMP2]] to <16 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP3]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll index 355f5306ee4db..04359eb6fcd7c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll @@ -16,8 +16,10 @@ define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <6 x ptr> [[TMP7]], ptr [[TMP1]], 
i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <6 x ptr> [[TMP8]], <6 x ptr> [[TMP19]], <6 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x ptr> [[TMP6]], <2 x ptr> poison, <6 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <6 x ptr> [[TMP20]], <6 x ptr> [[TMP21]], <6 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll index 70b7f14a3a2c9..1fedde4cc9fd7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll @@ -7,7 +7,7 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> , i64 [[XOR108_I_I_I]], i32 10 ; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v12i64(<16 x i64> poison, <12 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] 
to <16 x i1> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll index 382d6ae0e0a6f..652abef14771d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll @@ -12,11 +12,11 @@ define void @test() { ; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> , <4 x i64> [[TMP3]], <4 x i32> ; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]] ; CHECK: [[BB5]]: -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] ; CHECK-NEXT: br label %[[BB2]] ; CHECK: [[BB2]]: ; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-split-node.ll index eaf7bb2c9fdce..98ea4db6f6492 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-split-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-split-node.ll @@ -17,7 +17,8 @@ define i64 @test(i256 %0, { i32, i1 } %1) { ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 2 ; 
CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP13]], <4 x i32> [[TMP12]], i64 4) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP19]], <8 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i1> [[TMP15]] to i8 ; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.ctpop.i8(i8 [[TMP16]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll index 232e458504188..7206293444d55 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll @@ -11,7 +11,8 @@ define void @test() { ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i1> [ poison, %[[CONT221_THREAD781]] ], [ zeroinitializer, %[[ENTRY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> zeroinitializer, <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> , <8 x i1> [[TMP7]], <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 0, [[TMP6]] diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll index 048d2814b9abb..d62623047763f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll @@ -8,19 +8,20 @@ define void @test(ptr %0, i1 %1, i1 %2) { ; CHECK: [[BB4]]: ; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], %[[TMP7:.*]] ], [ zeroinitializer, [[TMP3:%.*]] ] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: br i1 [[TMP1]], label %[[TMP7]], label %[[BB14:.*]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[TMP7]], label %[[BB15:.*]] ; CHECK: [[TMP7]]: ; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i32>, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i32> [[TMP10]], splat (i32 1) ; CHECK-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> , <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> , <2 x i32> [[TMP11]], i64 2) -; CHECK-NEXT: br i1 [[TMP2]], label %[[BB15:.*]], label %[[BB4]] -; CHECK: [[BB14]]: -; CHECK-NEXT: br label %[[BB15]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP13]], <4 x i32> +; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16:.*]], label %[[BB4]] ; CHECK: [[BB15]]: -; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ [[TMP6]], %[[BB14]] ], [ [[TMP13]], %[[TMP7]] ] +; CHECK-NEXT: br label %[[BB16]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ [[TMP6]], %[[BB15]] ], [ [[TMP14]], %[[TMP7]] ] ; CHECK-NEXT: [[TMP17:%.*]] = load volatile ptr, ptr null, align 8 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr 
[[TMP17]], i64 176 ; CHECK-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP18]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll index 5baa5f3cdcdae..e35491823cc55 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll @@ -4,10 +4,7 @@ define i16 @test() { ; CHECK-LABEL: define i16 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> zeroinitializer, i64 0) -; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i16> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> zeroinitializer, <4 x i16> [[RDX_OP]], i64 0) -; CHECK-NEXT: [[OP_RDX:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> zeroinitializer) ; CHECK-NEXT: [[OP_RDX1:%.*]] = or i16 [[OP_RDX]], 0 ; CHECK-NEXT: ret i16 [[OP_RDX1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 81da11dc42e88..1904540c23146 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -210,7 +210,8 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP7]], <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) ; CHECK-NEXT: ret i1 [[TMP6]] @@ -244,7 +245,8 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { ; CHECK-NEXT: call void @use1(i1 [[TMP5]]) ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP8]], <4 x i1> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> [[TMP9]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]]) ; CHECK-NEXT: ret i1 [[TMP7]] @@ -316,7 +318,8 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: @logical_and_icmp_clamp_v8i32( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[Y:%.*]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> , <4 x i32> [[TMP2]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP7]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) @@ -392,7 +395,7 @@ define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> 
[[X:%.*]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[X]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i32> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <8 x i32> [[TMP3]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll index 799533824c5aa..fe5f4deecb8b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll @@ -9,19 +9,16 @@ define i32 @test(i32 %arg) { ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> , <2 x i64> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: 
[[TMP11:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP7]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[RDX_OP]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP12]]) ; CHECK-NEXT: [[OP_RDX]] = mul i32 0, [[TMP13]] ; CHECK-NEXT: br label %[[BB1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll index 8aaa71ef47a8c..c258c7d54df82 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll @@ -20,7 +20,8 @@ define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]] ; CHECK-NEXT: ret <4 x float> [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 02058b1fe8578..19ce11c457f63 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -15,7 +15,8 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -75,7 +76,8 @@ define void @test1() { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -137,7 +139,8 @@ define void @test_div() { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 
4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -197,7 +200,8 @@ define void @test_rem() { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-subvector.ll index af9d808f45fa1..3f6ec8ccad4ee 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-subvector.ll @@ -56,7 +56,8 @@ define void @test(i32 %j.6, i32 %m.4, i8 %v.5, ptr %a, i1 %tobool14.not) { ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 
[[SUB13]], i32 1 ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP27]], <4 x i32> [[TMP23]], i64 4) +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x i32> [[TMP27]], <8 x i32> [[TMP34]], <8 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP28]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP29]], <2 x i32> , <2 x i32> ; CHECK-NEXT: br i1 [[TOBOOL14_NOT]], label %[[IF_END18]], label %[[Q]] @@ -68,11 +69,13 @@ define void @test(i32 %j.6, i32 %m.4, i8 %v.5, ptr %a, i1 %tobool14.not) { ; CHECK-NEXT: [[CONV17:%.*]] = sext i8 [[V_44]] to i32 ; CHECK-NEXT: [[REM:%.*]] = mul i32 [[U_4]], [[CONV17]] ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> , i32 [[REM]], i32 5 -; CHECK-NEXT: [[TMP34:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP33]], <4 x i32> [[TMP32]], i64 0) +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP32]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <2 x i32> [[TMP31]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <8 x i32> [[TMP33]], <8 x i32> [[TMP39]], <8 x i32> ; CHECK-NEXT: br label %[[IF_END18]] ; CHECK: [[IF_END18]]: ; CHECK-NEXT: [[L_4]] = phi i8 [ 0, %[[Q]] ], [ [[L_3_PH]], %[[O]] ] -; CHECK-NEXT: [[TMP35]] = phi <8 x i32> [ [[TMP34]], %[[Q]] ], [ [[TMP28]], %[[O]] ] +; CHECK-NEXT: [[TMP35]] = phi <8 x i32> [ [[TMP40]], %[[Q]] ], [ [[TMP28]], %[[O]] ] ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <8 x i32> [[TMP35]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP37]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP36]], <4 x i32> ; CHECK-NEXT: br i1 [[TOBOOL14_NOT]], label 
%[[N]], label %[[P]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll index 52e13de8118d7..61294089fd4cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll @@ -16,9 +16,10 @@ define void @test(i32 %0, ptr %p) { ; CHECK: exit: ; CHECK-NEXT: [[TMP9:%.*]] = phi <8 x i32> [ [[TMP8]], [[ENTRY:%.*]] ], [ [[TMP6]], [[PH]] ] ; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ [[TMP5]], [[ENTRY]] ], [ zeroinitializer, [[PH]] ] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP9]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP9]], <4 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[RDX_OP]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP12]], <8 x i32> ; CHECK-NEXT: [[OP_RDX5:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP11]]) ; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX5]], [[OP_RDX]] ; CHECK-NEXT: store i32 [[OP_RDX2]], ptr [[P]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index ef1149a108e29..20a42777cf8e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -7,7 +7,7 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(i1 %arg) unnamed_addr #0 align 2 { ; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %arg, label [[IF_END50_I:%.*]], label 
[[IF_THEN22_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]] ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] @@ -24,11 +24,14 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(i ; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_I_I]] to i8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP10]], <8 x i8> [[TMP11]], i64 8) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP12]], <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP12]], <4 x i8> [[TMP13]], i64 4) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v2i8(<16 x i8> [[TMP14]], <2 x i8> [[TMP15]], i64 2) +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i8> [[TMP15]], <2 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP18]], <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], splat (i8 1) ; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr undef, align 1 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll index e56131b4681e3..92a1e289044d7 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll @@ -32,14 +32,12 @@ define <16 x half> @test(i32 %0, float %1, i32 %2) { ; CHECK-NEXT: [[TMP29:%.*]] = sitofp <16 x i32> [[TMP28]] to <16 x float> ; CHECK-NEXT: [[TMP30:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> [[TMP29]], <16 x float> zeroinitializer, <16 x float> zeroinitializer) ; CHECK-NEXT: [[TMP31:%.*]] = fadd <16 x float> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = call <12 x i1> @llvm.vector.insert.v12i1.v2i1(<12 x i1> poison, <2 x i1> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <12 x i1> [[TMP32]], <12 x i1> , <12 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <12 x i1> [[TMP33]], <12 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = select <16 x i1> [[TMP34]], <16 x float> zeroinitializer, <16 x float> [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = select <16 x i1> zeroinitializer, <16 x float> zeroinitializer, <16 x float> [[TMP31]] ; CHECK-NEXT: [[TMP36:%.*]] = bitcast <16 x float> [[TMP35]] to <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i32> [[TMP36]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <16 x float> -; CHECK-NEXT: [[TMP39:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> , <2 x float> [[TMP6]], i64 14) +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP53]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> zeroinitializer, <16 x float> [[TMP38]], <16 x float> [[TMP39]]) ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x float> [[TMP29]], i32 0 ; CHECK-NEXT: [[TMP42:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-SplitVectorize.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/revec-SplitVectorize.ll index 976de7cc8c21f..f98ed81b087b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-SplitVectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-SplitVectorize.ll @@ -4,30 +4,25 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> poison, <4 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP0]], <4 x i32> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP1]], <4 x i32> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP2]], <4 x i32> zeroinitializer, i64 12) -; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP3]], <4 x i32> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP4]], <4 x i32> zeroinitializer, i64 20) -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP5]], <4 x i32> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP6]], <4 x i32> zeroinitializer, i64 28) -; CHECK-NEXT: [[TMP8:%.*]] = trunc <32 x i32> [[TMP7]] to <32 x i1> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP13:%.*]] = phi <32 x i1> [ [[TMP8]], [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <32 x i1> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NARROW:%.*]] = select <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i1> zeroinitializer ; CHECK-NEXT: [[NARROW66:%.*]] = select <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i1> zeroinitializer ; 
CHECK-NEXT: [[NARROW67:%.*]] = select <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i1> zeroinitializer ; CHECK-NEXT: [[NARROW68:%.*]] = select <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> poison, <4 x i1> [[NARROW]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP10]], <4 x i1> [[NARROW66]], i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP11]], <4 x i1> [[NARROW67]], i64 8) -; CHECK-NEXT: [[TMP19:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP12]], <4 x i1> [[NARROW68]], i64 12) -; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP19]], <4 x i1> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP14]], <4 x i1> zeroinitializer, i64 20) -; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP15]], <4 x i1> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i1> @llvm.vector.insert.v32i1.v4i1(<32 x i1> [[TMP16]], <4 x i1> zeroinitializer, i64 28) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[NARROW]], <4 x i1> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i1> [[NARROW66]], <4 x i1> poison, <32 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> [[TMP2]], <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[NARROW67]], <4 x i1> poison, <32 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i1> [[TMP3]], <32 x i1> [[TMP4]], <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[NARROW68]], <4 x i1> poison, <32 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i1> [[TMP5]], <32 x i1> [[TMP6]], <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> 
zeroinitializer, <4 x i1> poison, <32 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <32 x i1> [[TMP7]], <32 x i1> [[TMP8]], <32 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i1> [[TMP9]], <32 x i1> [[TMP8]], <32 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i1> [[TMP10]], <32 x i1> [[TMP8]], <32 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <32 x i1> [[TMP11]], <32 x i1> [[TMP8]], <32 x i32> ; CHECK-NEXT: [[TMP18]] = or <32 x i1> [[TMP13]], [[TMP17]] ; CHECK-NEXT: br label [[VECTOR_BODY]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-getStoreMinimumVF.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-getStoreMinimumVF.ll index 3aea112e9edfe..14bdcd062edf8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-getStoreMinimumVF.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-getStoreMinimumVF.ll @@ -4,9 +4,7 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP0]], <4 x i8> zeroinitializer, i64 4) -; CHECK-NEXT: store <8 x i8> [[TMP1]], ptr null, align 1 +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr null, align 1 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll index 069274df396d7..4990fe102564a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll @@ -8,10 +8,7 @@ define void @test(ptr %in) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[IN]], i64 64 ; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr [[TMP1]], i32 2, <32 x i1> , <32 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <16 x i32> 
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP4]], <8 x i32> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i16> -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> zeroinitializer, [[TMP3]] ; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP0]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll index 3d0e6be661fd1..8f6a53c03ac68 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll @@ -4,31 +4,17 @@ define <4 x i16> @test() { ; CHECK-LABEL: define <4 x i16> @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = 
call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP7]], <4 x i16> zeroinitializer, i64 12) -; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]] -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]]) +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]]) +; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1 -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]]) +; CHECK-NEXT: [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2 -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]]) +; CHECK-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3 -; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> [[TMP11]], 
[[TMP9]] +; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]] ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]]) ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll index 3b9222b7d5ed1..9c0f65ec27165 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll @@ -35,13 +35,15 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) { ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]] ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP21]], <4 x i32> [[TMP10]], i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i32> [[TMP21]], <8 x i32> [[TMP24]], <8 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> , <8 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]] -; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP19]], i64 0) +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP19]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP20]], [[TMP16]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP19]], <4 x i32> 
[[RDX_OP]], i64 0) +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[RDX_OP]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP19]], <8 x i32> [[TMP25]], <8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP12]]) ; CHECK-NEXT: ret i32 [[TMP17]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 4cf2f99e60aeb..8dc8db9b444dc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -89,8 +89,9 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void @@ -147,10 +148,13 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] 
= call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2) -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP10]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll index 10e73b042f19b..f6bf138944749 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll @@ -18,20 +18,21 @@ define void @test(double %0) { ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> poison, <6 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = call <6 x double> @llvm.vector.insert.v6f64.v2f64(<6 x double> [[TMP11]], <2 x double> [[TMP10]], i64 4) +; 
CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x double> [[TMP11]], <6 x double> [[TMP13]], <6 x i32> ; CHECK-NEXT: br i1 false, label %[[DOTLR_PH272_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]] ; CHECK: [[_LR_PH272_PREHEADER:.*:]] -; CHECK-NEXT: br i1 false, [[DOT_CRIT_EDGE]], label %[[BB13:.*]] -; CHECK: [[BB13]]: +; CHECK-NEXT: br i1 false, [[DOT_CRIT_EDGE]], label %[[BB14:.*]] +; CHECK: [[BB14]]: ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <6 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = call <6 x double> @llvm.vector.insert.v6f64.v2f64(<6 x double> [[TMP15]], <2 x double> splat (double 0x7FF8000000000000), i64 4) -; CHECK-NEXT: br i1 false, label %[[BB17:.*]], [[DOT_CRIT_EDGE]] -; CHECK: [[BB17]]: +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <6 x double> [[TMP15]], <6 x double> , <6 x i32> +; CHECK-NEXT: br i1 false, label %[[BB18:.*]], [[DOT_CRIT_EDGE]] +; CHECK: [[BB18]]: ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <6 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: br [[DOT_CRIT_EDGE]] ; CHECK: [[__CRIT_EDGE:.*:]] -; CHECK-NEXT: [[TMP19:%.*]] = phi <6 x double> [ [[TMP12]], %[[BB7]] ], [ [[TMP18]], %[[BB17]] ], [ [[TMP16]], %[[BB13]] ], [ [[TMP12]], %[[DOTLR_PH272_PREHEADER]] ] +; CHECK-NEXT: [[TMP20:%.*]] = phi <6 x double> [ [[TMP12]], %[[BB7]] ], [ [[TMP18]], %[[BB18]] ], [ [[TMP17]], %[[BB14]] ], [ [[TMP12]], %[[DOTLR_PH272_PREHEADER]] ] ; CHECK-NEXT: ret void ; .thread: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll index 9abb994db1e73..680f950fae975 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll @@ -15,7 +15,8 
@@ define i1 @test(ptr %0, ptr %1, <2 x float> %2, <2 x float> %3, <2 x float> %4) ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <8 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP9]], i32 7 ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8) +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x float> [[TMP16]], <16 x float> [[TMP23]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP12]], <16 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP9]], i32 15 ; CHECK-NEXT: [[TMP20:%.*]] = fmul <16 x float> [[TMP18]], [[TMP17]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-parent-operands-in-spill.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-parent-operands-in-spill.ll index 5491e8ea7e0f8..cd3663e28eb75 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-parent-operands-in-spill.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-parent-operands-in-spill.ll @@ -28,10 +28,11 @@ define void @test(i32 %arg) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LOAD3]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[LOAD2]], i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP5]], <4 x i32> [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> ; CHECK-NEXT: br label 
%[[BB12]] ; CHECK: [[BB12]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <8 x i32> [ [[TMP6]], %[[BB8]] ], [ poison, %[[BB6]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <8 x i32> [ [[TMP7]], %[[BB8]] ], [ poison, %[[BB6]] ] ; CHECK-NEXT: ret void ; CHECK: [[BB21]]: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll index 5bfbd69330564..8e09847e9264e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll @@ -27,7 +27,8 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: [[TMP25:%.*]] = sitofp <2 x i32> [[TMP24]] to <2 x float> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP27]], <4 x float> [[TMP16]], i64 4) +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP51]], <8 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = fdiv <8 x float> zeroinitializer, [[TMP28]] ; CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP29]]) ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x float> [[TMP30]] to <8 x i32> @@ -50,19 +51,21 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 0, i64 8388608 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP32]], i32 1 ; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i64 0, i64 32768 -; CHECK-NEXT: br label %[[BB52:.*]] -; CHECK: [[BB51:.*]]: -; CHECK-NEXT: unreachable -; CHECK: [[BB52]]: ; CHECK-NEXT: br label %[[BB53:.*]] +; CHECK: [[BB52:.*]]: +; 
CHECK-NEXT: unreachable ; CHECK: [[BB53]]: +; CHECK-NEXT: br label %[[BB54:.*]] +; CHECK: [[BB54]]: ; CHECK-NEXT: [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 0, ptr null) ; CHECK-NEXT: [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]]) ; CHECK-NEXT: [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <8 x float> [[TMP56]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP57]], <2 x float> [[TMP55]], i64 0) -; CHECK-NEXT: [[TMP59:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP58]], <2 x float> [[TMP54]], i64 6) +; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <2 x float> [[TMP55]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP88:%.*]] = shufflevector <8 x float> [[TMP57]], <8 x float> [[TMP87]], <8 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <2 x float> [[TMP54]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <8 x float> [[TMP88]], <8 x float> [[TMP89]], <8 x i32> ; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x float> [[TMP59]] to <8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ult <8 x i32> [[TMP60]], splat (i32 1325400064) ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i1> [[TMP61]], i32 5 @@ -94,7 +97,7 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: [[TMP85:%.*]] = or i64 [[TMP84]], [[TMP48]] ; CHECK-NEXT: [[TMP86:%.*]] = or i64 [[TMP85]], [[TMP81]] ; CHECK-NEXT: store i64 [[TMP86]], ptr null, align 1 -; CHECK-NEXT: br label %[[BB51]] +; CHECK-NEXT: br label %[[BB52]] ; %5 = and i64 %2, 255 %6 = and i64 %2, -65536 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll index dd804663ff121..972a58cecc822 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-vector-operand-with-reuses.ll @@ -10,11 +10,13 @@ define void @test(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_1261]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> , <4 x i32> [[TMP2]], i64 4) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <12 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <12 x i32> @llvm.vector.insert.v12i32.v4i32(<12 x i32> [[TMP6]], <4 x i32> [[TMP5]], i64 8) +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <12 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> [[TMP20]], <12 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> , <16 x i32> [[TMP9]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll index d07353798edc9..3bafc3c6552f2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll @@ -15,12 +15,14 
@@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0 ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP30]], <2 x i32> [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP30]], <4 x i32> [[TMP31]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = uitofp <4 x i32> [[TMP10]] to <4 x float> ; CHECK-NEXT: [[TMP12:%.*]] = fdiv <4 x float> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> poison, i1 [[V4]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> [[TMP14]], <2 x i1> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> [[TMP32]], <4 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x float> zeroinitializer, <4 x float> [[TMP12]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP16]], i32 3 ; CHECK-NEXT: [[CONV_I_I1743_3:%.*]] = fptoui float [[TMP17]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index 3eabed5882e58..6073a264b9b12 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -160,7 +160,8 @@ define void @tiny_tree_not_fully_vectorizable2(ptr noalias nocapture %dst, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> 
poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> ; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll b/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll index 4b62ef688ca44..4c295355617e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/trunc-node-reused.ll @@ -4,16 +4,7 @@ define i16 @test() { ; CHECK-LABEL: define i16 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> , <2 x i1> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> zeroinitializer, <4 x i1> [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i1> [[TMP7]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> zeroinitializer) ; 
CHECK-NEXT: ret i16 [[TMP9]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index a821362a883a1..fd3c1a57aff34 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -7,7 +7,8 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 ; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0) +; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> ; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) ; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll index c30f94159916a..32e59697486a7 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll @@ -16,17 +16,19 @@ define i1 @test(float %0, double %1) { ; X86-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> ; X86-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32> ; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]] -; X86-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP13]], i64 0) -; X86-NEXT: [[TMP15:%.*]] = call <8 x double> 
@llvm.vector.insert.v8f64.v6f64(<8 x double> , <6 x double> [[TMP8]], i64 0) -; X86-NEXT: [[TMP16:%.*]] = fsub <8 x double> [[TMP14]], [[TMP15]] -; X86-NEXT: [[TMP17:%.*]] = fmul <8 x double> [[TMP14]], [[TMP15]] -; X86-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> -; X86-NEXT: [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float> -; X86-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer -; X86-NEXT: [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer -; X86-NEXT: [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]] -; X86-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]]) -; X86-NEXT: ret i1 [[TMP23]] +; X86-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> +; X86-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> +; X86-NEXT: [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> +; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> +; X86-NEXT: [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]] +; X86-NEXT: [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]] +; X86-NEXT: [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> +; X86-NEXT: [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float> +; X86-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer +; X86-NEXT: [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer +; X86-NEXT: [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]] +; X86-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]]) +; X86-NEXT: ret i1 [[TMP25]] ; ; AARCH64-LABEL: define i1 @test ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { @@ -42,17 +44,19 @@ define i1 @test(float %0, double %1) { ; AARCH64-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32> ; AARCH64-NEXT: 
[[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> ; AARCH64-NEXT: [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]] -; AARCH64-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP14]], i64 0) -; AARCH64-NEXT: [[TMP16:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v6f64(<8 x double> , <6 x double> [[TMP8]], i64 0) -; AARCH64-NEXT: [[TMP17:%.*]] = fsub <8 x double> [[TMP15]], [[TMP16]] -; AARCH64-NEXT: [[TMP18:%.*]] = fmul <8 x double> [[TMP15]], [[TMP16]] -; AARCH64-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> -; AARCH64-NEXT: [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float> -; AARCH64-NEXT: [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer -; AARCH64-NEXT: [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer -; AARCH64-NEXT: [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]] -; AARCH64-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]]) -; AARCH64-NEXT: ret i1 [[TMP24]] +; AARCH64-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> +; AARCH64-NEXT: [[TMP16:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP15]], <8 x i32> +; AARCH64-NEXT: [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> +; AARCH64-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP17]], <8 x i32> +; AARCH64-NEXT: [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]] +; AARCH64-NEXT: [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]] +; AARCH64-NEXT: [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> +; AARCH64-NEXT: [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float> +; AARCH64-NEXT: [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer +; AARCH64-NEXT: [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer +; AARCH64-NEXT: [[TMP25:%.*]] = 
freeze <8 x i1> [[TMP24]] +; AARCH64-NEXT: [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]]) +; AARCH64-NEXT: ret i1 [[TMP26]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll index dca34b681032c..a64075db37ba1 100644 --- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll @@ -9,9 +9,9 @@ define void @test() { ; CHECK: body: ; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] -; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00 -; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00 -; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[PHI1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> , [[TMP8]] +; CHECK-NEXT: [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]]) ; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 ; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll index a42c8f2c650ae..fff988a0a746e 100644 --- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll +++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll @@ -28,10 +28,14 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP13]], i32 1 ; CHECK-NEXT: 
[[TMP20:%.*]] = icmp eq <8 x i8> [[TMP17]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <64 x i1> , i1 [[CMP13_NOT_5]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = call <64 x i1> @llvm.vector.insert.v64i1.v8i1(<64 x i1> [[TMP21]], <8 x i1> [[TMP8]], i64 8) -; CHECK-NEXT: [[TMP23:%.*]] = call <64 x i1> @llvm.vector.insert.v64i1.v8i1(<64 x i1> [[TMP22]], <8 x i1> [[TMP20]], i64 56) -; CHECK-NEXT: [[TMP24:%.*]] = call <64 x i1> @llvm.vector.insert.v64i1.v4i1(<64 x i1> [[TMP23]], <4 x i1> [[TMP11]], i64 32) -; CHECK-NEXT: [[TMP25:%.*]] = call <64 x i1> @llvm.vector.insert.v64i1.v2i1(<64 x i1> [[TMP24]], <2 x i1> [[TMP3]], i64 6) +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> poison, <64 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i1> [[TMP21]], <64 x i1> [[TMP22]], <64 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <64 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <64 x i1> [[TMP23]], <64 x i1> [[TMP24]], <64 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <64 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <64 x i1> [[TMP29]], <64 x i1> [[TMP30]], <64 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <64 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <64 x i1> [[TMP31]], <64 x i1> [[TMP28]], <64 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP25]], <64 x i32> zeroinitializer, <64 x i32> zeroinitializer ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> [[TMP26]]) ; CHECK-NEXT: ret i32 [[TMP27]] diff --git a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll index f8a6c4dab3d51..c0a0318efd19e 100644 --- a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll @@ -24,9 +24,10 @@ define i32 @test(i32 %v, 
ptr %p) { ; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[I9_I_I]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <16 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP10]], <16 x i1> poison, <4 x i32> ; CHECK-NEXT: [[RDX_OP:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = call <16 x i1> @llvm.vector.insert.v16i1.v4i1(<16 x i1> [[TMP10]], <4 x i1> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i1> [[RDX_OP]], <4 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i1> [[TMP10]], <16 x i1> [[TMP15]], <16 x i32> ; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP13]]) ; CHECK-NEXT: [[AND252_US_I_24_I_I:%.*]] = select i1 [[OP_RDX]], i32 0, i32 0 ; CHECK-NEXT: br label %[[INC]] diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll index 3ef0de177b478..304af88b6d134 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll @@ -10,9 +10,10 @@ define i64 @test(ptr %p) { ; RISCV-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4 ; RISCV-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4 ; RISCV-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4 -; RISCV-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; RISCV-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0) -; RISCV-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4) +; RISCV-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> 
[[TMP0]], <4 x i64> poison, <8 x i32> +; RISCV-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[TMP2]], <8 x i32> +; RISCV-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <8 x i32> +; RISCV-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP7]], <8 x i32> ; RISCV-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], splat (i64 42) ; RISCV-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; RISCV-NEXT: ret i64 [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll index caca410f056c1..8e71f884b3bb4 100644 --- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -14,10 +14,10 @@ define void @func(i32 %0) { ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <32 x i32> , i32 [[TMP11]], i32 30 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14) -; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP16]], <2 x i32> zeroinitializer, i64 28) +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i32> [[TMP13]], <32 x i32> , <32 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <32 x i32> [[TMP15]], <32 x i32> , <32 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i32> [[TMP16]], <32 x i32> , <32 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector 
<32 x i32> [[TMP14]], <32 x i32> , <32 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i32> [[TMP8]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64> ; CHECK-NEXT: [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/revec-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/revec-insertelement.ll index 9dbaadeca1f41..1572b6ba3307d 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-insertelement.ll @@ -42,7 +42,7 @@ define void @test_missing_lanes_1_3(ptr %ptr, i32 %val0, i32 %val1) { ; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 0 ; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[GETELEMENTPTR0]], align 4 ; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[PTR]], i64 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[GETELEMENTPTR1]], align 4 ; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[PTR]], i64 12 ; CHECK-NEXT: store <4 x i32> poison, ptr [[GETELEMENTPTR3]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll index 250c60a61fea1..5611fda2c0223 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll @@ -32,7 +32,8 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; X86-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) ; X86-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) ; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> -; X86-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x 
i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4) +; X86-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> +; X86-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP7]], <8 x i32> ; X86-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] ; X86-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) ; X86-NEXT: ret i1 [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll index cec99c694391b..b738d25b39be1 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll @@ -7,9 +7,8 @@ define void @test1(ptr %in, ptr %out) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> @@ -20,9 +19,8 @@ define void @test1(ptr %in, ptr %out) { ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; COMBINE-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> 
[[TMP0]], i64 0) -; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> -; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> ; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> @@ -55,9 +53,8 @@ define void @test2(ptr %in, ptr %out) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> @@ -68,9 +65,8 @@ define void @test2(ptr %in, ptr %out) { ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; COMBINE-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) -; 
COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> ; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> ; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> @@ -101,18 +97,16 @@ entry: define void @test3(<16 x i32> %0, ptr %out) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> poison, <16 x i32> [[TMP0:%.*]], i64 0) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 ; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; CHECK-NEXT: ret void ; ; COMBINE-LABEL: @test3( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[TMP3:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> poison, <16 x i32> [[TMP0:%.*]], i64 0) -; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <64 x i32> -; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> +; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 
x i32> ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 ; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; COMBINE-NEXT: ret void @@ -138,9 +132,8 @@ define void @test4(ptr %in, ptr %out) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; CHECK-NEXT: ret void @@ -149,9 +142,8 @@ define void @test4(ptr %in, ptr %out) { ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 -; COMBINE-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) -; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> -; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> ; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; 
COMBINE-NEXT: ret void @@ -174,20 +166,14 @@ entry: define void @test5(ptr %out) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 0 -; CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr [[TMP3]], align 4 ; CHECK-NEXT: ret void ; ; COMBINE-LABEL: @test5( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0) -; COMBINE-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8) -; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 0 -; COMBINE-NEXT: store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; COMBINE-NEXT: store <8 x i32> zeroinitializer, ptr [[TMP3]], align 4 ; COMBINE-NEXT: ret void ; entry: @@ -214,7 +200,8 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = uitofp <32 x i16> [[TMP10]] to <32 x float> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP14]], <4 x float> [[LOAD2]], i64 8) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = 
shufflevector <16 x float> [[TMP14]], <16 x float> [[TMP19]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP16]], <16 x float> poison, <32 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fmul <32 x float> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32 @@ -222,18 +209,18 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; CHECK-NEXT: [[TMP17:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; CHECK-NEXT: store <32 x float> [[TMP4]], ptr [[IN2]], align 16 ; CHECK-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[LOAD5]], i64 0) -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float> -; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> [[LOAD2]], i64 0) -; CHECK-NEXT: [[TMP21:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP17]], i64 0) -; CHECK-NEXT: [[TMP22:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP20]], <4 x float> [[TMP21]], i64 4) -; CHECK-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP17]], i64 4) -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> 
@llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP22]], <4 x float> [[TMP23]], i64 8) +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP22]], <16 x float> [[TMP24]], <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <16 x float> [[TMP12]], [[TMP6]] ; CHECK-NEXT: store <16 x float> [[TMP13]], ptr [[GEP11]], align 16 @@ -252,7 +239,8 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; COMBINE-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> ; COMBINE-NEXT: [[TMP2:%.*]] = uitofp <32 x i16> [[TMP19]] to <32 x float> ; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP3]], <4 x float> [[LOAD2]], i64 8) +; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP13]], <16 x i32> ; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> ; COMBINE-NEXT: [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]] ; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32 @@ -260,18 +248,18 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; COMBINE-NEXT: [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; 
COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2]], align 16 ; COMBINE-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 -; COMBINE-NEXT: [[TMP13:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[LOAD5]], i64 0) -; COMBINE-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <32 x i32> -; COMBINE-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> +; COMBINE-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> +; COMBINE-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <16 x i32> ; COMBINE-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> ; COMBINE-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> ; COMBINE-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> ; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float> -; COMBINE-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> [[LOAD2]], i64 0) -; COMBINE-NEXT: [[TMP21:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 0) -; COMBINE-NEXT: [[TMP22:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP20]], <4 x float> [[TMP21]], i64 4) -; COMBINE-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 4) -; COMBINE-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP22]], <4 x float> [[TMP23]], i64 8) +; COMBINE-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> +; COMBINE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP22:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP21]], <16 x i32> +; COMBINE-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> 
[[TMP8]], <8 x float> poison, <4 x i32> +; COMBINE-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP22]], <16 x float> [[TMP27]], <16 x i32> ; COMBINE-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> ; COMBINE-NEXT: [[TMP17:%.*]] = fmul <16 x float> [[TMP16]], [[TMP9]] ; COMBINE-NEXT: store <16 x float> [[TMP17]], ptr [[GEP11]], align 16 @@ -365,40 +353,12 @@ entry: define i32 @test7() { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = fsub <16 x float> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <32 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP7]], <4 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12) -; CHECK-NEXT: [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub <16 x float> [[TMP9]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> -; 
CHECK-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[TMP9]], [[TMP12]] -; CHECK-NEXT: store <16 x float> [[TMP13]], ptr null, align 16 +; CHECK-NEXT: store <16 x float> zeroinitializer, ptr null, align 16 ; CHECK-NEXT: ret i32 0 ; ; COMBINE-LABEL: @test7( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0) -; COMBINE-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> zeroinitializer, i64 8) -; COMBINE-NEXT: [[TMP2:%.*]] = fsub <16 x float> [[TMP1]], [[TMP1]] -; COMBINE-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[TMP1]], [[TMP1]] -; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <32 x i32> -; COMBINE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <16 x i32> -; COMBINE-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0) -; COMBINE-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 4) -; COMBINE-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP7]], <4 x float> zeroinitializer, i64 8) -; COMBINE-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12) -; COMBINE-NEXT: [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP5]] -; COMBINE-NEXT: [[TMP11:%.*]] = fsub <16 x float> [[TMP9]], [[TMP5]] -; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> -; COMBINE-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[TMP9]], [[TMP12]] -; COMBINE-NEXT: store <16 x float> [[TMP13]], ptr null, align 16 +; COMBINE-NEXT: store <16 x float> zeroinitializer, ptr null, align 16 ; COMBINE-NEXT: ret i32 0 ; entry: diff --git 
a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index afe92f89ac0d1..ac8b10a0087d0 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -93,19 +93,15 @@ define void @test4(ptr %in, ptr %out) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[IN:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP4]], <8 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP6:%.*]] = fmul <16 x float> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fmul <16 x float> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <16 x float> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <16 x float> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i1, ptr [[OUT:%.*]], i64 8 -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i1> @llvm.vector.extract.v8i1.v16i1(<16 x i1> [[TMP11]], i64 8) +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> poison, <8 x i32> ; CHECK-NEXT: store <8 x 
i1> [[TMP13]], ptr [[OUT]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i1> @llvm.vector.extract.v8i1.v16i1(<16 x i1> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> poison, <8 x i32> ; CHECK-NEXT: store <8 x i1> [[TMP14]], ptr [[TMP12]], align 1 ; CHECK-NEXT: ret void ; @@ -151,22 +147,14 @@ define <4 x i1> @test6(ptr %in1, ptr %in2) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[IN1:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[IN2:%.*]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> poison, <4 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> poison, <4 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP7]], <4 x i32> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP8]], <4 x i32> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP9]], <4 x i32> zeroinitializer, i64 12) -; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i16> [[TMP15]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP17]], <4 x i16> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP18]], <4 x i16> zeroinitializer, 
i64 8) -; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP19]], <4 x i16> zeroinitializer, i64 12) -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <16 x i16> [[TMP16]], [[TMP20]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt <32 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i1> [[TMP6]], <32 x i1> poison, <16 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP11]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP10]] +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <32 x i1> [[TMP6]], <32 x i1> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP22]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <16 x i1> [[TMP24]], <16 x i1> poison, <4 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP25]]) @@ -217,10 +205,7 @@ entry: define void @test7() { ; CHECK-LABEL: @test7( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> poison, <8 x i64> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP1]], <8 x i64> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i64> [[TMP2]] to <16 x i16> -; CHECK-NEXT: store <16 x i16> [[TMP3]], ptr null, align 2 +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr null, align 2 ; CHECK-NEXT: ret void ; %1 = getelementptr i8, ptr null, i64 16 @@ -234,18 +219,12 @@ define void @test7() { define void @test8() { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> 
@llvm.vector.insert.v8f32.v2f32(<8 x float> poison, <2 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP0]], <2 x float> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP1]], <2 x float> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP2]], <2 x float> zeroinitializer, i64 6) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> zeroinitializer, i64 2) ; CHECK-NEXT: br i1 false, label [[FOR0:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for0: -; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP5]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP7]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ] ; CHECK-NEXT: [[TMP8]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: br i1 false, label [[FOR0]], label [[FOR_BODY]] ; @@ -268,13 +247,9 @@ for.body: define void @test9() { ; CHECK-LABEL: @test9( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4) ; CHECK-NEXT: br label [[FOR_BODY13:%.*]] ; CHECK: for.body13: -; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = zext 
<8 x i1> [[TMP2]] to <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP3]], ptr null, align 4 +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr null, align 4 ; CHECK-NEXT: br label [[FOR_BODY13]] ; entry: @@ -293,9 +268,8 @@ define void @test10() { ; CHECK-LABEL: @test10( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <16 x i32> @@ -334,14 +308,13 @@ define void @test11(<2 x i64> %0, i64 %1, <2 x i64> %2) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> , [[TMP2:%.*]] ; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[TMP4]] to <2 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i16> -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP6]], <2 x i16> [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP10]], <4 x i32> ; 
CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i16> [[TMP8]] to <4 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> poison, <2 x i8> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP10]], <2 x i8> zeroinitializer, i64 2) -; CHECK-NEXT: [[TMP12:%.*]] = urem <4 x i8> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP11:%.*]] = urem <4 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer ; CHECK-NEXT: ret void ; entry: @@ -365,21 +338,15 @@ define void @test12() { ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP6]], <8 x float> [[TMP3]], i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v16f32(<32 x float> [[TMP7]], <16 x float> [[TMP5]], i64 16) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <32 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <32 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x float> [[TMP6]], <32 x float> [[TMP7]], <32 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x float> [[TMP10]], <32 x float> [[TMP11]], <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = fpext <32 x float> [[TMP8]] to <32 x double> -; CHECK-NEXT: [[TMP10:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> poison, <8 x double> zeroinitializer, i64 0) 
-; CHECK-NEXT: [[TMP11:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP10]], <8 x double> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP12:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP11]], <8 x double> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP13:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP12]], <8 x double> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP14:%.*]] = fadd <32 x double> [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <32 x double> zeroinitializer, [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = fptrunc <32 x double> [[TMP14]] to <32 x float> -; CHECK-NEXT: [[TMP16:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP17:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP16]], <8 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP18:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP17]], <8 x float> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP19:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP18]], <8 x float> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP20:%.*]] = fcmp ogt <32 x float> [[TMP19]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = fcmp ogt <32 x float> zeroinitializer, [[TMP15]] ; CHECK-NEXT: ret void ; entry: @@ -413,22 +380,17 @@ entry: define void @test13(<8 x i32> %0, ptr %out0, ptr %out1, ptr %out2) { ; CHECK-LABEL: @test13( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> poison, <8 x i32> [[TMP0:%.*]], i64 0) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <32 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> poison, <8 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x 
i32> [[TMP9]], <8 x i32> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP10]], <8 x i32> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP5]], <8 x i32> zeroinitializer, i64 24) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0:%.*]], <8 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> ; CHECK-NEXT: [[OR0:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: store <4 x i32> [[OR0]], ptr [[OUT0:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[OUT1:%.*]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v16i32(<16 x i32> [[TMP3]], i64 12) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[OUT2:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -454,19 +416,14 @@ for.end.loopexit: define void @test14(<8 x i1> %0) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.vector.insert.v16i1.v8i1(<16 x i1> poison, <8 x i1> [[TMP0:%.*]], i64 0) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> poison, <16 x i32> +; CHECK-NEXT: 
[[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0:%.*]], <8 x i1> poison, <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> poison, <8 x i16> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP10:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> [[TMP9]], <8 x i16> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> [[TMP10]], <8 x i16> zeroinitializer, i64 16) -; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i16> @llvm.vector.insert.v32i16.v8i16(<32 x i16> [[TMP7]], <8 x i16> zeroinitializer, i64 24) ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[TMP6:%.*]] = phi <16 x i16> [ [[TMP5]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.vector.extract.v4i16.v16i16(<16 x i16> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[OR0:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: ret void ; @@ -496,15 +453,9 @@ define i32 @test15() { ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[TMP1]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[TMP1]], align 16 ; CHECK-NEXT: store <4 x float> [[TMP3]], ptr null, align 16 -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP4]], <4 x float> zeroinitializer, i64 4) -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP5]], <4 x float> zeroinitializer, i64 8) -; CHECK-NEXT: 
[[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 12) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 8) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12) ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fadd <16 x float> [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fadd <16 x float> zeroinitializer, [[TMP5]] ; CHECK-NEXT: store <16 x float> [[TMP12]], ptr [[TMP0]], align 16 ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll deleted file mode 100644 index 10d4fa2be0a70..0000000000000 --- a/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll +++ /dev/null @@ -1,161 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=structurizecfg < %s | FileCheck %s - - -%pair = type { i32, i32 } -define void @test_if_then_else(ptr %ptr, i1 %cond) { -; CHECK-LABEL: define void @test_if_then_else( -; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true -; CHECK-NEXT: [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4 -; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0 -; CHECK-NEXT: br i1 [[COND_INV]], label %[[ELSE:.*]], label %[[FLOW:.*]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[SUM_ELSE:%.*]], %[[ELSE]] ], [ [[A_THEN]], %[[ENTRY]] ] -; CHECK-NEXT: 
[[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[ENTRY]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[THEN:.*]], label %[[MERGE:.*]] -; CHECK: [[THEN]]: -; CHECK-NEXT: br label %[[MERGE]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0 -; CHECK-NEXT: [[SUM_ELSE]] = add i32 [[A_ELSE]], 1 -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[MERGE]]: -; CHECK-NEXT: store i32 [[TMP0]], ptr [[PTR]], align 4 -; CHECK-NEXT: ret void -; -entry: - %load_then = load %pair, ptr %ptr - br i1 %cond, label %then, label %else - -then: - %a_then = extractvalue %pair %load_then, 0 - br label %merge - -else: - %a_else = extractvalue %pair %load_then, 0 - %sum_else = add i32 %a_else, 1 - br label %merge - -merge: - %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ] - store i32 %phi, ptr %ptr - ret void -} - -define void @test_if_else_then(ptr %ptr, i1 %cond) { -; CHECK-LABEL: define void @test_if_else_then( -; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true -; CHECK-NEXT: [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4 -; CHECK-NEXT: br i1 [[COND_INV]], label %[[THEN:.*]], label %[[FLOW:.*]] -; CHECK: [[THEN]]: -; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0 -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[A_THEN]], %[[THEN]] ], [ poison, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[THEN]] ], [ true, %[[ENTRY]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[ELSE:.*]], label %[[MERGE:.*]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0 -; CHECK-NEXT: [[SUM_ELSE:%.*]] = add i32 [[A_ELSE]], 1 -; CHECK-NEXT: br label %[[MERGE]] -; CHECK: [[MERGE]]: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[SUM_ELSE]], %[[ELSE]] ] -; CHECK-NEXT: store i32 [[PHI]], ptr [[PTR]], align 4 -; CHECK-NEXT: ret void -; -entry: - 
%load_then = load %pair, ptr %ptr - br i1 %cond, label %else, label %then - -then: - %a_then = extractvalue %pair %load_then, 0 - br label %merge - -else: - %a_else = extractvalue %pair %load_then, 0 - %sum_else = add i32 %a_else, 1 - br label %merge - -merge: - %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ] - store i32 %phi, ptr %ptr - ret void -} - -define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { -; CHECK-LABEL: define amdgpu_kernel void @test_loop_with_if( -; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true -; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[I3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I15:%.*]], %[[LATCH:.*]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4 -; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD]], 0 -; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[FLOW:.*]] -; CHECK: [[IF]]: -; CHECK-NEXT: [[I9:%.*]] = icmp sle i32 [[I3]], 10 -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[FLOW1:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[Y:%.*]], %[[ELSE:.*]] ], [ [[A_THEN]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ [[TMP2:%.*]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[THEN:.*]], label %[[LATCH]] -; CHECK: [[THEN]]: -; CHECK-NEXT: store i32 [[I]], ptr [[PTR]], align 4 -; CHECK-NEXT: br label %[[LATCH]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP2]] = phi i1 [ true, %[[IF]] ], [ false, %[[LOOP]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[I9]], %[[IF]] ], [ [[COND_INV]], %[[LOOP]] ] -; CHECK-NEXT: br i1 [[TMP3]], label %[[ELSE]], label %[[FLOW1]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: [[I2:%.*]] = extractvalue [[PAIR]] [[LOAD]], 1 -; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD]], 0 -; CHECK-NEXT: [[Y]] = add i32 [[A_ELSE]], [[I2]] -; CHECK-NEXT: br label 
%[[FLOW1]] -; CHECK: [[LATCH]]: -; CHECK-NEXT: store i32 [[TMP0]], ptr [[PTR]], align 4 -; CHECK-NEXT: [[I15]] = add nsw i32 [[TMP0]], 20 -; CHECK-NEXT: [[I16:%.*]] = icmp sge i32 [[I15]], 255 -; CHECK-NEXT: br i1 [[I16]], label %[[END:.*]], label %[[LOOP]] -; CHECK: [[END]]: -; CHECK-NEXT: ret void -; -entry: - %a = tail call i32 @llvm.amdgcn.workitem.id.x() - br label %loop - -loop: - %entry_phi = phi i32 [ 0, %entry ], [ %a15, %latch ] - %load = load %pair, ptr %ptr - br i1 %cond, label %if, label %else - -if: - %cmp = icmp sgt i32 %entry_phi, 10 - br i1 %cmp, label %then, label %else - -then: - %a_then = extractvalue %pair %load, 0 - store i32 %a, ptr %ptr, align 4 - br label %latch - -else: - %a2 = extractvalue %pair %load, 1 - %y = extractvalue %pair %load, 0 - %a_else = add i32 %y, %a2 - br label %latch - -latch: - %a_test = phi i32 [ %a_then, %then ], [ %a_else, %else ] - store i32 %a_test, ptr %ptr - %a15 = add nsw i32 %a_test, 20 - %a16 = icmp slt i32 %a15, 255 - br i1 %a16, label %loop, label %end - -end: - ret void -} diff --git a/llvm/test/tools/dxil-dis/lifetimes.ll b/llvm/test/tools/dxil-dis/lifetimes.ll new file mode 100644 index 0000000000000..cb3e6291c7bc0 --- /dev/null +++ b/llvm/test/tools/dxil-dis/lifetimes.ll @@ -0,0 +1,38 @@ +; RUN: llc --filetype=obj %s -o - | dxil-dis -o - | FileCheck %s +target triple = "dxil-unknown-shadermodel6.7-library" + +define void @test_lifetimes() { +; CHECK-LABEL: test_lifetimes +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x i32], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [2 x i32], [2 x i32]* [[ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast [2 x i32]* [[ALLOCA]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start(i64 4, i8* nonnull [[BITCAST]]) +; CHECK-NEXT: store i32 0, i32* [[GEP]], align 4 +; CHECK-NEXT: [[BITCAST:%.*]] = bitcast [2 x i32]* [[ALLOCA]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.end(i64 4, i8* nonnull [[BITCAST]]) +; CHECK-NEXT: ret void +; + %a = alloca [2 x i32], 
align 4 + %gep = getelementptr [2 x i32], ptr %a, i32 0, i32 0 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %a) + store i32 0, ptr %gep, align 4 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %a) + ret void +} + +; CHECK-DAG: attributes [[LIFETIME_ATTRS:#.*]] = { nounwind } + +; CHECK-DAG: ; Function Attrs: nounwind +; CHECK-DAG: declare void @llvm.lifetime.start(i64, i8* nocapture) [[LIFETIME_ATTRS]] + +; CHECK-DAG: ; Function Attrs: nounwind +; CHECK-DAG: declare void @llvm.lifetime.end(i64, i8* nocapture) [[LIFETIME_ATTRS]] + +; Function Attrs: nounwind memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64, ptr) #0 + +; Function Attrs: nounwind memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64, ptr) #0 + +attributes #0 = { nounwind memory(argmem: readwrite) } + diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index 8c6f6436782e9..e73f69743805c 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -235,6 +235,53 @@ int llvm_test_dibuilder(void) { M, "LargeEnumTest", LLVMMetadataAsValue(LLVMGetModuleContext(M), LargeEnumTest)); + LLVMValueRef FooVal3 = LLVMConstInt(LLVMInt64Type(), 8, false); + LLVMValueRef FooVal4 = LLVMConstInt(LLVMInt64Type(), 4, false); + LLVMMetadataRef lo = LLVMValueAsMetadata(FooVal1); + LLVMMetadataRef hi = LLVMValueAsMetadata(FooVal2); + LLVMMetadataRef strd = LLVMValueAsMetadata(FooVal3); + LLVMMetadataRef bias = LLVMValueAsMetadata(FooVal4); + LLVMMetadataRef SubrangeMetadataTy = LLVMDIBuilderCreateSubrangeType( + DIB, File, "foo", 3, 42, File, 64, 0, 0, Int64Ty, lo, hi, strd, bias); + LLVMAddNamedMetadataOperand( + M, "SubrangeType", + LLVMMetadataAsValue(LLVMGetModuleContext(M), SubrangeMetadataTy)); + + LLVMMetadataRef SetMetadataTy1 = LLVMDIBuilderCreateSetType( + DIB, File, "enumset", 7, File, 42, 64, 0, EnumTest); + LLVMMetadataRef SetMetadataTy2 = LLVMDIBuilderCreateSetType( + DIB, File, "subrangeset", 11, File, 
42, 64, 0, SubrangeMetadataTy); + LLVMAddNamedMetadataOperand( + M, "SetType1", + LLVMMetadataAsValue(LLVMGetModuleContext(M), SetMetadataTy1)); + LLVMAddNamedMetadataOperand( + M, "SetType2", + LLVMMetadataAsValue(LLVMGetModuleContext(M), SetMetadataTy2)); + + LLVMMetadataRef DynSubscripts[] = { + LLVMDIBuilderGetOrCreateSubrange(DIB, 0, 10), + }; + LLVMMetadataRef Loc = LLVMDIBuilderCreateExpression(DIB, NULL, 0); + LLVMMetadataRef Rank = LLVMDIBuilderCreateExpression(DIB, NULL, 0); + LLVMMetadataRef DynamicArrayMetadataTy = LLVMDIBuilderCreateDynamicArrayType( + DIB, File, "foo", 3, 42, File, 64 * 10, 0, Int64Ty, DynSubscripts, 1, Loc, + FooVar1, NULL, Rank, NULL); + LLVMAddNamedMetadataOperand( + M, "DynType", + LLVMMetadataAsValue(LLVMGetModuleContext(M), DynamicArrayMetadataTy)); + + LLVMMetadataRef StructPTy = LLVMDIBuilderCreateForwardDecl( + DIB, 2 /*DW_TAG_class_type*/, "Class1", 5, NameSpace, File, 0, 0, 192, 0, + "FooClass", 8); + + LLVMMetadataRef Int32Ty = + LLVMDIBuilderCreateBasicType(DIB, "Int32", 5, 32, 0, LLVMDIFlagZero); + LLVMMetadataRef StructElts[] = {Int64Ty, Int64Ty, Int32Ty}; + LLVMMetadataRef ClassArr = LLVMDIBuilderGetOrCreateArray(DIB, StructElts, 3); + LLVMReplaceArrays(DIB, &StructPTy, &ClassArr, 1); + LLVMAddNamedMetadataOperand( + M, "ClassType", LLVMMetadataAsValue(LLVMGetModuleContext(M), StructPTy)); + // Using the new debug format, debug records get attached to instructions. 
// Insert a `br` and `ret` now to absorb the debug records which are // currently "trailing", meaning that they're associated with a block diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp index f92b2b55fa8dc..a705e7d51d874 100644 --- a/llvm/tools/llvm-dwarfdump/Statistics.cpp +++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp @@ -878,7 +878,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, DenseSet UniqueLines; DenseSet UniqueNonZeroLines; - for (const auto &CU : static_cast(&DICtx)->compile_units()) { + for (const auto &CU : DICtx.compile_units()) { if (DWARFDie CUDie = CU->getNonSkeletonUnitDIE(false)) { // This variable holds variable information for functions with // abstract_origin, but just for the current CU. diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index d3c613ee823ba..1a535ede07096 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -203,7 +203,7 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, W.getOStream() << ' '; for (i = 0; TmpSecPtr < SecEnd && i < 4; ++i) { for (k = 0; TmpSecPtr < SecEnd && k < 4; k++, TmpSecPtr++) { - uint8_t Val = *(reinterpret_cast(TmpSecPtr)); + uint8_t Val = *TmpSecPtr; W.getOStream() << format_hex_no_prefix(Val, 2); } W.getOStream() << ' '; diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index c95f96c4bb3c6..bdfbc8557859a 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -15,6 +15,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include +#include #include #include #include @@ -86,6 +87,10 @@ struct CtorTesterMapInfo { CtorTester getTestKey(int i, CtorTester *) { return CtorTester(i); } CtorTester getTestValue(int i, CtorTester *) { return CtorTester(42 + i); } +std::optional getTestKey(int i, std::optional *) { + return i; +} + // Test fixture, 
with helper functions implemented by forwarding to global // function overloads selected by component types of the type parameter. This // allows all of the map implementations to be tested with shared @@ -117,11 +122,13 @@ typedef ::testing::Types, DenseMap, DenseMap, DenseMap, + DenseMap, uint32_t>, SmallDenseMap, SmallDenseMap, SmallDenseMap, - SmallDenseMap + SmallDenseMap, + SmallDenseMap, uint32_t> > DenseMapTestTypes; // clang-format on diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index 9b88e423e802b..678960418d7d7 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -1738,4 +1738,34 @@ TEST_F(ScalarEvolutionsTest, ComplexityComparatorIsStrictWeakOrdering2) { SE.getAddExpr(Ops); } +TEST_F(ScalarEvolutionsTest, ComplexityComparatorIsStrictWeakOrdering3) { + Type *Int64Ty = Type::getInt64Ty(Context); + Constant *Init = Constant::getNullValue(Int64Ty); + Type *PtrTy = PointerType::get(Context, 0); + Constant *Null = Constant::getNullValue(PtrTy); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(Context), {}, false); + + Value *V0 = new GlobalVariable(M, Int64Ty, false, + GlobalValue::ExternalLinkage, Init, "V0"); + Value *V1 = new GlobalVariable(M, Int64Ty, false, + GlobalValue::ExternalLinkage, Init, "V1"); + Value *V2 = new GlobalVariable(M, Int64Ty, false, + GlobalValue::InternalLinkage, Init, "V2"); + Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M); + BasicBlock *BB = BasicBlock::Create(Context, "entry", F); + Value *C0 = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, V0, Null, + "c0", BB); + Value *C1 = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, V1, Null, + "c1", BB); + Value *C2 = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, V2, Null, + "c2", BB); + Value *Or0 = BinaryOperator::CreateOr(C0, C1, "or0", BB); + Value *Or1 = BinaryOperator::CreateOr(Or0, C2, "or1", BB); + 
ReturnInst::Create(Context, nullptr, BB); + ScalarEvolution SE = buildSE(*F); + // When _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG, this will + // crash if the comparator is inconsistent about global variable linkage. + SE.getSCEV(Or1); +} + } // end namespace llvm diff --git a/llvm/unittests/Support/EndianTest.cpp b/llvm/unittests/Support/EndianTest.cpp index bba1a56168f70..59281c0ed5444 100644 --- a/llvm/unittests/Support/EndianTest.cpp +++ b/llvm/unittests/Support/EndianTest.cpp @@ -237,6 +237,7 @@ TEST(Endian, PackedEndianSpecificIntegral) { reinterpret_cast(little + 1); EXPECT_EQ(*big_val, *little_val); + EXPECT_EQ(big_val->value(), little_val->value()); } TEST(Endian, PacketEndianSpecificIntegralAsEnum) { diff --git a/llvm/utils/TableGen/SearchableTableEmitter.cpp b/llvm/utils/TableGen/SearchableTableEmitter.cpp index 7d57297eb7c0b..d17d90b452bd7 100644 --- a/llvm/utils/TableGen/SearchableTableEmitter.cpp +++ b/llvm/utils/TableGen/SearchableTableEmitter.cpp @@ -32,9 +32,9 @@ using namespace llvm; #define DEBUG_TYPE "searchable-table-emitter" static int64_t getAsInt(const Init *B) { - if (const BitsInit *BI = dyn_cast(B)) + if (const auto *BI = dyn_cast(B)) return *BI->convertInitializerToInt(); - if (const IntInit *II = dyn_cast(B)) + if (const auto *II = dyn_cast(B)) return II->getValue(); llvm_unreachable("Unexpected initializer"); } @@ -126,20 +126,21 @@ class SearchableTableEmitter { std::string primaryRepresentation(SMLoc Loc, const GenericField &Field, const Init *I) { - if (const StringInit *SI = dyn_cast(I)) { + if (const auto *SI = dyn_cast(I)) { if (Field.IsCode || SI->hasCodeFormat()) return SI->getValue().str(); else return SI->getAsString(); - } else if (const BitsInit *BI = dyn_cast(I)) + } + if (const auto *BI = dyn_cast(I)) return "0x" + utohexstr(getAsInt(BI)); - else if (const BitInit *BI = dyn_cast(I)) + if (const auto *BI = dyn_cast(I)) return BI->getValue() ? 
"true" : "false"; - else if (Field.IsIntrinsic) + if (Field.IsIntrinsic) return "Intrinsic::" + getIntrinsic(I).EnumName.str(); - else if (Field.IsInstruction) + if (Field.IsInstruction) return I->getAsString(); - else if (Field.Enum) { + if (Field.Enum) { const GenericEnum::Entry *Entry = Field.Enum->getEntry(cast(I)->getDef()); if (!Entry) @@ -152,7 +153,7 @@ class SearchableTableEmitter { } bool isIntrinsic(const Init *I) { - if (const DefInit *DI = dyn_cast(I)) + if (const auto *DI = dyn_cast(I)) return DI->getDef()->isSubClassOf("Intrinsic"); return false; } @@ -174,7 +175,8 @@ class SearchableTableEmitter { if (Ctx == TypeInTempStruct) return "std::string"; return "StringRef"; - } else if (const BitsRecTy *BI = dyn_cast(Field.RecType)) { + } + if (const auto *BI = dyn_cast(Field.RecType)) { unsigned NumBits = BI->getNumBits(); if (NumBits <= 8) return "uint8_t"; @@ -188,9 +190,10 @@ class SearchableTableEmitter { "' lookup method '" + Index.Name + "', key field '" + Field.Name + "' of type bits is too large"); - } else if (isa(Field.RecType)) { + } + if (isa(Field.RecType)) return "bool"; - } else if (Field.Enum || Field.IsIntrinsic || Field.IsInstruction) + if (Field.Enum || Field.IsIntrinsic || Field.IsInstruction) return "unsigned"; PrintFatalError(Index.Loc, Twine("In table '") + Table.Name + "' lookup method '" + @@ -244,67 +247,81 @@ int64_t SearchableTableEmitter::getNumericKey(const SearchIndex &Index, /// key of \p Index. bool SearchableTableEmitter::compareBy(const Record *LHS, const Record *RHS, const SearchIndex &Index) { - for (const auto &Field : Index.Fields) { - const Init *LHSI = LHS->getValueInit(Field.Name); - const Init *RHSI = RHS->getValueInit(Field.Name); + // Compare two values and return: + // * -1 if LHS < RHS. + // * 1 if LHS > RHS. + // * 0 if LHS == RHS. 
+ auto CmpLTValue = [](const auto &LHS, const auto &RHS) -> int { + if (LHS < RHS) + return -1; + if (LHS > RHS) + return 1; + return 0; + }; + + // Specialized form of `CmpLTValue` for string-like types that uses compare() + // to do the comparison of the 2 strings once (instead if 2 comparisons if we + // use `CmpLTValue`). + auto CmpLTString = [](StringRef LHS, StringRef RHS) -> int { + return LHS.compare(RHS); + }; + // Compare two fields and returns: + // - true if LHS < RHS. + // - false if LHS > RHS. + // - std::nullopt if LHS == RHS. + auto CmpLTField = [this, &Index, &CmpLTValue, + &CmpLTString](const Init *LHSI, const Init *RHSI, + const GenericField &Field) -> int { if (isa(Field.RecType) || isa(Field.RecType)) { int64_t LHSi = getAsInt(LHSI); int64_t RHSi = getAsInt(RHSI); - if (LHSi < RHSi) - return true; - if (LHSi > RHSi) - return false; - } else if (Field.IsIntrinsic) { + return CmpLTValue(LHSi, RHSi); + } + + if (Field.IsIntrinsic) { const CodeGenIntrinsic &LHSi = getIntrinsic(LHSI); const CodeGenIntrinsic &RHSi = getIntrinsic(RHSI); - if (std::tie(LHSi.TargetPrefix, LHSi.Name) < - std::tie(RHSi.TargetPrefix, RHSi.Name)) - return true; - if (std::tie(LHSi.TargetPrefix, LHSi.Name) > - std::tie(RHSi.TargetPrefix, RHSi.Name)) - return false; - } else if (Field.IsInstruction) { + if (int Cmp = CmpLTString(LHSi.TargetPrefix, RHSi.TargetPrefix)) + return Cmp; + return CmpLTString(LHSi.Name, RHSi.Name); + } + + if (Field.IsInstruction) { // This does not correctly compare the predefined instructions! const Record *LHSr = cast(LHSI)->getDef(); const Record *RHSr = cast(RHSI)->getDef(); - bool LHSpseudo = LHSr->getValueAsBit("isPseudo"); - bool RHSpseudo = RHSr->getValueAsBit("isPseudo"); - if (LHSpseudo && !RHSpseudo) - return true; - if (!LHSpseudo && RHSpseudo) - return false; + // Order pseudo instructions before non-pseudo ones. 
+ bool LHSNotPseudo = !LHSr->getValueAsBit("isPseudo"); + bool RHSNotPseudo = !RHSr->getValueAsBit("isPseudo"); + if (int Cmp = CmpLTValue(LHSNotPseudo, RHSNotPseudo)) + return Cmp; + return CmpLTString(LHSr->getName(), RHSr->getName()); + } - int comp = LHSr->getName().compare(RHSr->getName()); - if (comp < 0) - return true; - if (comp > 0) - return false; - } else if (Field.Enum) { - auto LHSr = cast(LHSI)->getDef(); - auto RHSr = cast(RHSI)->getDef(); + if (Field.Enum) { + const Record *LHSr = cast(LHSI)->getDef(); + const Record *RHSr = cast(RHSI)->getDef(); int64_t LHSv = Field.Enum->getEntry(LHSr)->Value; int64_t RHSv = Field.Enum->getEntry(RHSr)->Value; - if (LHSv < RHSv) - return true; - if (LHSv > RHSv) - return false; - } else { - std::string LHSs = primaryRepresentation(Index.Loc, Field, LHSI); - std::string RHSs = primaryRepresentation(Index.Loc, Field, RHSI); - - if (isa(Field.RecType)) { - LHSs = StringRef(LHSs).upper(); - RHSs = StringRef(RHSs).upper(); - } + return CmpLTValue(LHSv, RHSv); + } - int comp = LHSs.compare(RHSs); - if (comp < 0) - return true; - if (comp > 0) - return false; + std::string LHSs = primaryRepresentation(Index.Loc, Field, LHSI); + std::string RHSs = primaryRepresentation(Index.Loc, Field, RHSI); + if (isa(Field.RecType)) { + LHSs = StringRef(LHSs).upper(); + RHSs = StringRef(RHSs).upper(); } + return CmpLTString(LHSs, RHSs); + }; + + for (const GenericField &Field : Index.Fields) { + const Init *LHSI = LHS->getValueInit(Field.Name); + const Init *RHSI = RHS->getValueInit(Field.Name); + if (int Cmp = CmpLTField(LHSI, RHSI, Field)) + return Cmp < 0; } return false; } @@ -359,8 +376,8 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, std::vector> Entries; Entries.reserve(Table.Entries.size()); - for (unsigned i = 0; i < Table.Entries.size(); ++i) - Entries.emplace_back(Table.Entries[i], i); + for (auto [Idx, TblEntry] : enumerate(Table.Entries)) + Entries.emplace_back(TblEntry, Idx); 
llvm::stable_sort(Entries, [&](const std::pair &LHS, @@ -369,19 +386,19 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, }); IndexRowsStorage.reserve(Entries.size()); - for (const auto &Entry : Entries) { - IndexRowsStorage.push_back(Entry.first); + for (const auto &[EntryRec, EntryIndex] : Entries) { + IndexRowsStorage.push_back(EntryRec); OS << " { "; ListSeparator LS; for (const auto &Field : Index.Fields) { std::string Repr = primaryRepresentation( - Index.Loc, Field, Entry.first->getValueInit(Field.Name)); + Index.Loc, Field, EntryRec->getValueInit(Field.Name)); if (isa(Field.RecType)) Repr = StringRef(Repr).upper(); OS << LS << Repr; } - OS << ", " << Entry.second << " },\n"; + OS << ", " << EntryIndex << " },\n"; } OS << " };\n\n"; @@ -398,8 +415,8 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, Index.Fields[0].IsInstruction)) { int64_t FirstKeyVal = getNumericKey(Index, IndexRows[0]); IsContiguous = true; - for (unsigned i = 0; i < IndexRows.size(); ++i) { - if (getNumericKey(Index, IndexRows[i]) != (FirstKeyVal + i)) { + for (const auto &[Idx, IndexRow] : enumerate(IndexRows)) { + if (getNumericKey(Index, IndexRow) != FirstKeyVal + (int64_t)Idx) { IsContiguous = false; break; } @@ -509,9 +526,9 @@ void SearchableTableEmitter::emitLookupFunction(const GenericTable &Table, OS << " ||\n Key." << Field.Name << " != Idx->" << Field.Name; } - if (ShouldReturnRange) + if (ShouldReturnRange) { OS << " return llvm::make_range(It.first, It.second);\n"; - else if (IsPrimary) { + } else if (IsPrimary) { OS << ")\n return nullptr;\n\n"; OS << " return &*Idx;\n"; } else { @@ -557,8 +574,7 @@ void SearchableTableEmitter::emitGenericTable(const GenericTable &Table, // The primary data table contains all the fields defined for this map. 
OS << "constexpr " << Table.CppTypeName << " " << Table.Name << "[] = {\n"; - for (unsigned i = 0; i < Table.Entries.size(); ++i) { - const Record *Entry = Table.Entries[i]; + for (const auto &[Idx, Entry] : enumerate(Table.Entries)) { OS << " { "; ListSeparator LS; @@ -567,7 +583,7 @@ void SearchableTableEmitter::emitGenericTable(const GenericTable &Table, << primaryRepresentation(Table.Locs[0], Field, Entry->getValueInit(Field.Name)); - OS << " }, // " << i << "\n"; + OS << " }, // " << Idx << "\n"; } OS << " };\n"; diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn index eb8aef259bfd2..b8c8585a33a9b 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn @@ -27,6 +27,7 @@ static_library("Analysis") { "FixitUtil.cpp", "IntervalPartition.cpp", "IssueHash.cpp", + "LifetimeSafety.cpp", "LiveVariables.cpp", "MacroExpansionContext.cpp", "ObjCNoReturn.cpp", diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn index 2e7aa45f38e3e..44f5fdc20837c 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn @@ -78,14 +78,12 @@ static_library("builtins") { cflags += [ "-fomit-frame-pointer" ] } cflags_c = [ "-std=c11" ] - cflags_cc = [ "-nostdinc++" ] } defines = builtins_defines sources = builtins_sources deps = lse_targets - include_dirs = [ "//third-party/siphash/include" ] } # Currently unused but necessary to make sync_source_lists_from_cmake.py happy. 
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni index bba2a4e891aa6..ba151075c0f9d 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni @@ -429,7 +429,6 @@ if (current_cpu == "arm") { if (current_cpu == "arm64") { builtins_sources -= [ "fp_mode.c" ] builtins_sources += [ - "aarch64/emupac.cpp", "aarch64/fp_mode.c", "cpu_model/aarch64.c", ] diff --git a/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn index 97e4fdf61ec2d..87848075a804e 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn @@ -46,7 +46,6 @@ if (current_toolchain != host_toolchain) { "//compiler-rt/include($host_toolchain)", "//compiler-rt/lib/builtins", "//compiler-rt/test:lit_common_configured", - "//llvm/utils/not($host_toolchain)", ] } } diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn index 4ad599820ac57..3a7508ab7187e 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn @@ -62,5 +62,6 @@ static_library("CPlusPlus") { "LibStdcppUniquePointer.cpp", "MSVCUndecoratedNameParser.cpp", "MsvcStl.cpp", + "MsvcStlSmartPointer.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index ad73f51e57eaf..306e4d3f9f6b8 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -140,6 +140,7 @@ static_library("LLVMRISCVCodeGen") { "RISCVInsertVSETVLI.cpp", "RISCVInsertWriteVXRM.cpp", "RISCVInstrInfo.cpp", + 
"RISCVInterleavedAccess.cpp", "RISCVLandingPadSetup.cpp", "RISCVLateBranchOpt.cpp", "RISCVLoadStoreOptimizer.cpp", diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index 353e64b3d013e..ff4269ed7acd2 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -1,3 +1,4 @@ +include(TableGen) include(GNUInstallDirs) include(LLVMDistributionSupport) diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index c9d2a54433736..8a5976e547169 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -80,6 +80,7 @@ #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" #include "mlir/Conversion/VectorToSPIRV/VectorToSPIRVPass.h" #include "mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h" +#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h" namespace mlir { diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 5a864865adffc..50c67da91a4af 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -1495,4 +1495,13 @@ def ConvertVectorToXeGPU : Pass<"convert-vector-to-xegpu"> { ]; } +//===----------------------------------------------------------------------===// +// XeVMToLLVM +//===----------------------------------------------------------------------===// + +def ConvertXeVMToLLVMPass : Pass<"convert-xevm-to-llvm"> { + let summary = "Convert XeVM to LLVM dialect"; + let dependentDialects = ["LLVM::LLVMDialect"]; +} + #endif // MLIR_CONVERSION_PASSES diff --git a/mlir/include/mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h b/mlir/include/mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h new file mode 100644 index 0000000000000..7ffdbd4307f9e --- /dev/null +++ b/mlir/include/mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h @@ -0,0 +1,27 @@ +//===-- XeVMToLLVM.h - Convert XeVM to LLVM dialect -------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_XEVMTOLLVM_XEVMTOLLVMPASS_H_ +#define MLIR_CONVERSION_XEVMTOLLVM_XEVMTOLLVMPASS_H_ + +#include + +namespace mlir { +class DialectRegistry; +class LLVMTypeConverter; +class RewritePatternSet; +class Pass; + +#define GEN_PASS_DECL_CONVERTXEVMTOLLVMPASS +#include "mlir/Conversion/Passes.h.inc" + +void populateXeVMToLLVMConversionPatterns(RewritePatternSet &patterns); + +void registerConvertXeVMToLLVMInterface(DialectRegistry ®istry); +} // namespace mlir + +#endif // MLIR_CONVERSION_XEVMTOLLVM_XEVMTOLLVMPASS_H_ diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td index 3766eb16e9429..187ac9aa18aac 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td @@ -258,4 +258,38 @@ def GpuSPIRVAttachTarget: Pass<"spirv-attach-target", ""> { ]; } +def GpuXeVMAttachTarget : Pass<"xevm-attach-target", ""> { + let summary = "Attaches a XeVM target attribute to a GPU Module."; + let description = [{ + This pass searches for all GPU Modules in the immediate regions and attaches + a XeVM target if the module matches the name specified by the `module` argument. 
+ + Example: + ``` + // File: in.mlir: + gpu.module @nvvm_module_1 {...} + gpu.module @rocdl_module_2 {...} + gpu.module @xevm_module_3 {...} + // mlir-opt --xevm-attach-target="module=xevm.* chip=pvc" in.mlir + gpu.module @nvvm_module_1 {...} + gpu.module @rocdl_module_2 {...} + gpu.module @xevm_module_3 [#xevm.target] {...} + ``` + }]; + let options = + [Option<"moduleMatcher", "module", "std::string", + /*default=*/[{""}], + "Regex used to identify the modules to attach the target to.">, + Option<"triple", "triple", "std::string", + /*default=*/"\"spirv64-unknown-unknown\"", "Target triple.">, + Option<"chip", "chip", "std::string", + /*default=*/"\"bmg\"", "Target chip.">, + Option<"optLevel", "O", "unsigned", + /*default=*/"2", "Optimization level.">, + ListOption<"linkLibs", "l", "std::string", + "Extra bitcode libraries paths to link to.">, + Option<"cmdOptions", "cmd-options", "std::string", + /*default=*/[{""}], + "Command line options passed to downstream compiler">]; +} #endif // MLIR_DIALECT_GPU_PASSES diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 6895e946b8a45..45a8904375e2b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -153,14 +153,20 @@ class NVVM_IntrOp traits = [], // NVVM special register op definitions //===----------------------------------------------------------------------===// -class NVVM_SpecialRegisterOp traits = []> : +class NVVM_PureSpecialRegisterOp traits = []> : NVVM_IntrOp { let arguments = (ins); let assemblyFormat = "attr-dict `:` type($res)"; } -class NVVM_SpecialRangeableRegisterOp traits = []> : - NVVM_SpecialRegisterOp traits = []> : + NVVM_IntrOp { + let arguments = (ins); + let assemblyFormat = "attr-dict `:` type($res)"; +} + +class NVVM_PureSpecialRangeableRegisterOp traits = []> : + NVVM_PureSpecialRegisterOp])> { let arguments = (ins OptionalAttr:$range); @@ -189,63 +195,63 @@ class 
NVVM_SpecialRangeableRegisterOp traits = []> //===----------------------------------------------------------------------===// // Lane, Warp, SM, Grid index and range -def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">; -def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">; -def NVVM_WarpIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpid">; -def NVVM_WarpDimOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nwarpid">; -def NVVM_SmIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.smid">; -def NVVM_SmDimOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nsmid">; -def NVVM_GridIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.gridid">; +def NVVM_LaneIdOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.laneid">; +def NVVM_WarpSizeOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">; +def NVVM_WarpIdOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.warpid">; +def NVVM_WarpDimOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nwarpid">; +def NVVM_SmIdOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.smid">; +def NVVM_SmDimOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nsmid">; +def NVVM_GridIdOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.gridid">; //===----------------------------------------------------------------------===// // Lane Mask Comparison Ops -def NVVM_LaneMaskEqOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.eq">; -def NVVM_LaneMaskLeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.le">; -def NVVM_LaneMaskLtOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.lt">; -def NVVM_LaneMaskGeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.ge">; -def NVVM_LaneMaskGtOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.gt">; +def NVVM_LaneMaskEqOp : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.lanemask.eq">; +def NVVM_LaneMaskLeOp : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.lanemask.le">; +def NVVM_LaneMaskLtOp : 
NVVM_PureSpecialRegisterOp<"read.ptx.sreg.lanemask.lt">; +def NVVM_LaneMaskGeOp : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.lanemask.ge">; +def NVVM_LaneMaskGtOp : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.lanemask.gt">; //===----------------------------------------------------------------------===// // Thread index and range -def NVVM_ThreadIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">; -def NVVM_ThreadIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">; -def NVVM_ThreadIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">; -def NVVM_BlockDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">; -def NVVM_BlockDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">; -def NVVM_BlockDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">; +def NVVM_ThreadIdXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">; +def NVVM_ThreadIdYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">; +def NVVM_ThreadIdZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">; +def NVVM_BlockDimXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">; +def NVVM_BlockDimYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">; +def NVVM_BlockDimZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">; //===----------------------------------------------------------------------===// // Block index and range -def NVVM_BlockIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">; -def NVVM_BlockIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">; -def NVVM_BlockIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">; -def NVVM_GridDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">; -def NVVM_GridDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">; -def NVVM_GridDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">; +def NVVM_BlockIdXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">; 
+def NVVM_BlockIdYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">; +def NVVM_BlockIdZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">; +def NVVM_GridDimXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">; +def NVVM_GridDimYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">; +def NVVM_GridDimZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">; //===----------------------------------------------------------------------===// // CTA Cluster index and range -def NVVM_ClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x", [NVVMRequiresSM<90>]>; -def NVVM_ClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">; -def NVVM_ClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">; -def NVVM_ClusterDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">; -def NVVM_ClusterDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">; -def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">; +def NVVM_ClusterIdXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x", [NVVMRequiresSM<90>]>; +def NVVM_ClusterIdYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">; +def NVVM_ClusterIdZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">; +def NVVM_ClusterDimXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">; +def NVVM_ClusterDimYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">; +def NVVM_ClusterDimZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">; //===----------------------------------------------------------------------===// // CTA index and range within Cluster -def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x", [NVVMRequiresSM<90>]>; -def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y", 
[NVVMRequiresSM<90>]>; -def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z", [NVVMRequiresSM<90>]>; -def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x", [NVVMRequiresSM<90>]>; -def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y", [NVVMRequiresSM<90>]>; -def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; +def NVVM_BlockInClusterIdXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x", [NVVMRequiresSM<90>]>; +def NVVM_BlockInClusterIdYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y", [NVVMRequiresSM<90>]>; +def NVVM_BlockInClusterIdZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z", [NVVMRequiresSM<90>]>; +def NVVM_ClusterDimBlocksXOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x", [NVVMRequiresSM<90>]>; +def NVVM_ClusterDimBlocksYOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y", [NVVMRequiresSM<90>]>; +def NVVM_ClusterDimBlocksZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; //===----------------------------------------------------------------------===// // CTA index and across Cluster dimensions -def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>; -def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">; +def NVVM_ClusterId : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>; +def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">; //===----------------------------------------------------------------------===// // Clock registers @@ -256,7 +262,7 @@ def NVVM_GlobalTimerOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.globaltimer">; 
//===----------------------------------------------------------------------===// // envreg registers foreach index = !range(0, 32) in { - def NVVM_EnvReg # index # Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg" # index>; + def NVVM_EnvReg # index # Op : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.envreg" # index>; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td index b5e81d595d74c..f457f47d56219 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td @@ -201,25 +201,25 @@ def XeVM_BlockLoad2dOp let description = [{ The `xevm.blockload2d` operation loads a two dimensional matrix tile from a base matrix residing in global memory. The parameters are: - $ptr - the base address of the base matrix containing the tile to load - $base_width - the width of the base matrix in number of bytes. - $base_height - the number of rows in the base matrix - $base_pitch - the physical stride between the first columns of the current - row and the subsequent row in number of bytes. - $x, $y, $tile_width, $tile_height - the starting offsets and shape of - the tile to load in number of elements. - $elem_size_in_bits - the size in bits of the matrix element type + * `ptr` - the base address of the base matrix containing the tile to load + * `base_width` - the width of the base matrix in number of bytes. + * `base_height` - the number of rows in the base matrix + * `base_pitch` - the physical stride between the first columns of the current + row and the subsequent row in number of bytes. + * `x`, `y`, `tile_width`, `tile_height` - the starting offsets and shape of + the tile to load in number of elements. 
+ * `elem_size_in_bits` - the size in bits of the matrix element type - 32 for f32, tf32 - 16 for f16, int16, bf16 - 8 for int8 - $v_blocks - number of consecutive tiles in innermost dimension direction to load - $transpose - transpose the tile in registers (useful for 32 bit element type) - $pack_register - pack element types narrower than register bit width. + * `v_blocks` - number of consecutive tiles in innermost dimension direction to load + * `transpose` - transpose the tile in registers (useful for 32 bit element type) + * `pack_register` - pack element types narrower than register bit width. [M, N] => [M/factor, N, factor] where factor is register_size_in_bits / elem_size_in_bits - $cache_control - an enumerator that sets the cache behaviour + * `cache_control` - an enumerator that sets the cache behaviour Notes: - - the $transpose and $pack_register parameters are mutual exclusive + - the `transpose` and `pack_register` parameters are mutual exclusive - transposing the tile loaded is used for A matrix in backward path or used for the B matrix operand (D = C + A * B), where A has row-major layout and B should have column-major layout in memory. - if the tile loaded contains out of bound elements of the matrix, they are filled with 0. @@ -262,19 +262,19 @@ def XeVM_BlockStore2dOp let description = [{ The `xevm.blockstore2d` operation stores a two dimensional tile into a larger matrix residing in global memory. The parameters are: - $ptr - the base address of the target matrix where to store the tile - $base_width - the width of the base matrix in number of bytes. - $base_height - the number of rows in the base matrix - $base_pitch - the physical stride between the first columns of the current - row and the subsequent row in number of bytes. 
- $x, $y, $tile_width, $tile_height - the starting offsets and shape of the tile to store + * `ptr` - the base address of the target matrix where to store the tile + * `base_width` - the width of the base matrix in number of bytes. + * `base_height` - the number of rows in the base matrix + * `base_pitch` - the physical stride between the first columns of the current + row and the subsequent row in number of bytes. + * `x`, `y`, `tile_width`, `tile_height` - the starting offsets and shape of the tile to store in number of elements. - $elem_size_in_bits - the size in bits of the matrix element + * `elem_size_in_bits` - the size in bits of the matrix element - 32 for f32, tf32 - 16 for f16, int16, bf16 - 8 for int8 - $cache_control - an enumerator that sets the cache behaviour - $stored_val - the tile to store + * `cache_control` - an enumerator that sets the cache behaviour + * `stored_val` - the tile to store Example: ```mlir @@ -351,10 +351,10 @@ def XeVM_MemfenceOp This operation ensures that all prior memory accesses of this work-item to `addrspace` are visible to all other work-items in `scope`. Parameters description: - $scope - specify the memory scope at which all other work-items should observe - memory operations prior to the fence. - $addrspace - specify the address space of work-item's memory accesses - to be affected by the fence. + * `scope` - specify the memory scope at which all other work-items should observe + memory operations prior to the fence. + * `addrspace` - specify the address space of work-item's memory accesses + to be affected by the fence. }]; let assemblyFormat = [{prop-dict attr-dict}]; @@ -370,9 +370,9 @@ def XeVM_PrefetchOp let summary = "Prefetch data into a cache subsystem."; let description = [{ Work-item issues a prefetch from global memory to cache: - $ptr - LLVM pointer with address space. Address space must be 1 (global) - or 4 (generic) - $cache_control - specify caching options + * `ptr` - LLVM pointer with address space. 
Address space must be 1 (global) + or 4 (generic) + * `cache_control` - specify caching options }]; let assemblyFormat = [{ operands prop-dict attr-dict `:` `(` type(operands) `)` @@ -395,19 +395,19 @@ def XeVM_BlockPrefetch2dOp let description = [{ The `xevm.blockprefetch2d` operation prefetches a two dimensional tile from a larger base matrix residing in global memory. The parameters are: - $ptr - the base address of the base matrix containing the tile to prefetch - $base_width - the width of the base matrix in number of bytes. - $base_height - the number of rows in the base matrix - $base_pitch - the physical stride between the first columns of the current - row and the subsequent row in number of bytes. - $x, $y, $tile_width, $tile_height - the starting offsets and shape of tile - to prefetch in number of elements. - $elem_size_in_bits - the size in bits of the matrix element - - 32 for f32, bf32 - - 16 for f16, int16, bf16 - - 8 for int8, int4, int2 - $v_blocks - number of tiles in innermost dimension direction to prefetch - $cache_control - an enumerator that sets the cache behaviour + * `ptr` - the base address of the base matrix containing the tile to prefetch + * `base_width` - the width of the base matrix in number of bytes. + * `base_height` - the number of rows in the base matrix + * `base_pitch` - the physical stride between the first columns of the current + row and the subsequent row in number of bytes. + * `x`, `y`, `tile_width`, `tile_height` - the starting offsets and shape of tile + to prefetch in number of elements. 
+ * `elem_size_in_bits` - the size in bits of the matrix element + - 32 for f32, bf32 + - 16 for f16, int16, bf16 + - 8 for int8, int4, int2 + * `v_blocks` - number of tiles in innermost dimension direction to prefetch + * `cache_control` - an enumerator that sets the cache behaviour Example: ```mlir @@ -452,9 +452,9 @@ def XeVM_ElemTypeAttr : I32EnumAttr<"ElemType", "XeVM element type", def XeVM_MMAShapeAttr : XeVM_Attr<"MMAShape", "mma_shape"> { let description = [{ MMA operation is represented as D=AxB+C, where - A has the shape MxK. - B has the shape KxN. - D and C have the shape MxN. + - A has the shape MxK. + - B has the shape KxN. + - D and C have the shape MxN. This attribute encodes the shape of all matrices that participate in MMA. }]; let parameters = (ins "int":$m, "int":$n, "int":$k); @@ -484,17 +484,17 @@ def XeVM_MMAOp D = C + A x B where the A, B, C input matrices and the result D have shapes: - D : MxN - C : MxN - A : MxK - B : KxN + - D : MxN + - C : MxN + - A : MxK + - B : KxN Parameters: - `a` - vector of matrix A elements. - `b` - vector of matrix B elements. - `c` - (optional) vector of matrix C elements. - `shape` - the shape of the matrices, specified as `M`, `N`, and `K` values. - `types` - the data types of the matrices, specified as `D`, `A`, `B`, and optionally `C`. + * `a` - vector of matrix A elements. + * `b` - vector of matrix B elements. + * `c` - (optional) vector of matrix C elements. + * `shape` - the shape of the matrices, specified as `M`, `N`, and `K` values. + * `types` - the data types of the matrices, specified as `D`, `A`, `B`, and optionally `C`. 
Example: ```mlir diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td index 9ff2507629856..9123ac34af67d 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td @@ -180,6 +180,36 @@ def OpenACC_MappableTypeInterface : TypeInterface<"MappableType"> { return ::mlir::acc::VariableTypeCategory::uncategorized; }] >, + InterfaceMethod< + /*description=*/[{ + Generates the operations that would be normally placed in a recipe's + init region. It inserts at the builder's current location. + It can be used either to directly "inline" the init region + or if the caller sets the insertion point to inside a recipe body, + it fills it in. This does not generate the `acc.yield` that normally + would terminate a recipe. + + The `extents` are optional and can be empty - it is only when a + slice of the private variable needs allocation. + The `initVal` can be empty - it is primarily needed for reductions + to ensure the variable is also initialized with appropriate value. + + If the return value is empty, it means that recipe body was not + successfully generated. 
+ }], + /*retTy=*/"::mlir::Value", + /*methodName=*/"generatePrivateInit", + /*args=*/(ins "::mlir::OpBuilder &":$builder, + "::mlir::Location":$loc, + "::mlir::TypedValue<::mlir::acc::MappableType>":$var, + "::llvm::StringRef":$varName, + "::mlir::ValueRange":$extents, + "::mlir::Value":$initVal), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return {}; + }] + >, ]; } diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td index 46a705eefc262..65771b602e0d0 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td @@ -462,16 +462,19 @@ def SPIRV_DotOp : SPIRV_Op<"Dot", }]; let arguments = (ins - SPIRV_VectorOf:$vector1, - SPIRV_VectorOf:$vector2 + SPIRV_VectorOf:$vector1, + SPIRV_VectorOf:$vector2 ); let results = (outs - SPIRV_Float:$result + SPIRV_AnyFloat:$result ); let assemblyFormat = "operands attr-dict `:` type($vector1) `->` type($result)"; + // Require dynamic availability specification based on operand/result type. 
+ bit autogenAvailability = 0; + let hasVerifier = 0; } diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td index 3bf0be0a716aa..73f6877c12fab 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td @@ -420,6 +420,62 @@ def DotOp : AVX_LowOp<"dot", [Pure, }]; } +//----------------------------------------------------------------------------// +// AVX Int8 Dot +//----------------------------------------------------------------------------// + +def DotInt8Op : AVX_Op<"dot.i8", [Pure, + X86IntrinsicOpInterface, + AllTypesMatch<["a", "b"]>, + AllTypesMatch<["w", "dst"]>, + TypesMatchWith<"`a` has four times elements as `w`", + "w", "a", + "VectorType::get({::llvm::cast($_self).getShape()[0] * 4}, " + "IntegerType::get($_self.getContext(), 8))"> + ]> { + let summary = "Dot Int8 op"; + let description = [{ + The `dot` op is an AVX2-Int8 specific op that can lower to the proper + LLVMAVX2-INT8 operation `llvm.vpdpbssd` depending on the width of MLIR + vectors it is applied to. + + #### From the Intel Intrinsics Guide: + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in `a` with + corresponding signed 8-bit integers in `b`, producing 4 intermediate signed 16-bit + results. Sum these 4 results with the corresponding 32-bit integer in `w`, and + store the packed 32-bit results in `dst`. 
+ + Example: + ```mlir + %dst = x86vector.avx.dot.i8 %w, %a, %b : vector<32xi8> -> vector<8xi32> + ``` + }]; + let arguments = (ins VectorOfLengthAndType<[4, 8], [I32]>:$w, + VectorOfLengthAndType<[16, 32], [I8]>:$a, + VectorOfLengthAndType<[16, 32], [I8]>:$b + ); + let results = (outs VectorOfLengthAndType<[4, 8], [I32]>:$dst); + let assemblyFormat = + "$w `,` $a `,` $b attr-dict `:` type($a) `->` type($w)"; + + let extraClassDeclaration = [{ + std::string getIntrinsicName() { + std::string intr = "llvm.x86.avx2.vpdpbssd"; + VectorType vecType = getW().getType(); + unsigned elemBitWidth = vecType.getElementTypeBitWidth(); + unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth; + intr += "." + std::to_string(opBitWidth); + return intr; + } + + SmallVector getIntrinsicOperands( + ::mlir::ArrayRef operands, + const ::mlir::LLVMTypeConverter &typeConverter, + ::mlir::RewriterBase &rewriter); + }]; +} + //----------------------------------------------------------------------------// // AVX: Convert BF16/F16 to F32 and broadcast into packed F32 //----------------------------------------------------------------------------// diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h index 0f2d0e45008cc..d5a9a2c3aeba7 100644 --- a/mlir/include/mlir/InitAllExtensions.h +++ b/mlir/include/mlir/InitAllExtensions.h @@ -32,6 +32,7 @@ #include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" +#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h" #include "mlir/Dialect/AMX/Transforms.h" #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h" #include "mlir/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.h" @@ -91,6 +92,7 @@ inline void registerAllExtensions(DialectRegistry ®istry) { gpu::registerConvertGpuToLLVMInterface(registry); NVVM::registerConvertGpuToNVVMInterface(registry); 
vector::registerConvertVectorToLLVMInterface(registry); + registerConvertXeVMToLLVMInterface(registry); // Register all transform dialect extensions. affine::registerTransformDialectExtension(registry); diff --git a/mlir/include/mlir/TableGen/Class.h b/mlir/include/mlir/TableGen/Class.h index f750a34a3b2ba..349ea54954feb 100644 --- a/mlir/include/mlir/TableGen/Class.h +++ b/mlir/include/mlir/TableGen/Class.h @@ -332,13 +332,23 @@ class Method : public ClassDeclarationBase { : properties(properties), methodSignature(std::forward(retType), std::forward(name), std::forward(args)...), - methodBody(properties & Declaration) {} + methodBody(properties & Declaration) { + if (!methodPropertiesAreCompatible(properties)) { + llvm::report_fatal_error( + "Invalid combination of method properties specified"); + } + } /// Create a method with a return type, a name, method properties, and a list /// of parameters. Method(StringRef retType, StringRef name, Properties properties, std::initializer_list params) : properties(properties), methodSignature(retType, name, params), - methodBody(properties & Declaration) {} + methodBody(properties & Declaration) { + if (!methodPropertiesAreCompatible(properties)) { + llvm::report_fatal_error( + "Invalid combination of method properties specified"); + } + } // Define move constructor and assignment operator to prevent copying. Method(Method &&) = default; @@ -402,6 +412,10 @@ class Method : public ClassDeclarationBase { MethodBody methodBody; /// Deprecation message if the method is deprecated. std::optional deprecationMessage; + + /// Utility method to verify method properties correctness. + [[maybe_unused]] static bool + methodPropertiesAreCompatible(Properties properties); }; /// This enum describes C++ inheritance visibility. 
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 79e8bb6add0da..5d52cf3f04b6a 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -15,6 +15,7 @@ #define MLIR_TARGET_LLVMIR_MODULETRANSLATION_H #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Operation.h" #include "mlir/IR/SymbolTable.h" #include "mlir/IR/Value.h" @@ -24,6 +25,7 @@ #include "mlir/Target/LLVMIR/TypeToLLVM.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/FPEnv.h" namespace llvm { @@ -108,6 +110,41 @@ class ModuleTranslation { return blockMapping.lookup(block); } + /// Find the LLVM-IR loop that represents an MLIR loop. + llvm::CanonicalLoopInfo *lookupOMPLoop(omp::NewCliOp mlir) const { + llvm::CanonicalLoopInfo *result = loopMapping.lookup(mlir); + assert(result && "attempt to get non-existing loop"); + return result; + } + + /// Find the LLVM-IR loop that represents an MLIR loop. + llvm::CanonicalLoopInfo *lookupOMPLoop(Value mlir) const { + return lookupOMPLoop(mlir.getDefiningOp()); + } + + /// Mark an OpenMP loop as having been consumed. + void invalidateOmpLoop(omp::NewCliOp mlir) { loopMapping.erase(mlir); } + + /// Mark an OpenMP loop as having been consumed. 
+ void invalidateOmpLoop(Value mlir) { + invalidateOmpLoop(mlir.getDefiningOp()); + } + + /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR + /// OpenMPIRBuilder CanonicalLoopInfo + void mapOmpLoop(omp::NewCliOp mlir, llvm::CanonicalLoopInfo *llvm) { + assert(llvm && "argument must be non-null"); + llvm::CanonicalLoopInfo *&cur = loopMapping[mlir]; + assert(cur == nullptr && "attempting to map a loop that is already mapped"); + cur = llvm; + } + + /// Map an MLIR OpenMP dialect CanonicalLoopInfo to its lowered LLVM-IR + /// OpenMPIRBuilder CanonicalLoopInfo + void mapOmpLoop(Value mlir, llvm::CanonicalLoopInfo *llvm) { + mapOmpLoop(mlir.getDefiningOp(), llvm); + } + /// Stores the mapping between an MLIR operation with successors and a /// corresponding LLVM IR instruction. void mapBranch(Operation *mlir, llvm::Instruction *llvm) { @@ -381,6 +418,12 @@ class ModuleTranslation { DenseMap valueMapping; DenseMap blockMapping; + /// List of not yet consumed MLIR loop handles (represented by an omp.new_cli + /// operation which creates a value of type CanonicalLoopInfoType) and their + /// LLVM-IR representation as CanonicalLoopInfo which is managed by the + /// OpenMPIRBuilder. + DenseMap loopMapping; + /// A mapping between MLIR LLVM dialect terminators and LLVM IR terminators /// they are converted to. This allows for connecting PHI nodes to the source /// values after all operations are converted. 
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index e4b4974600577..24a48993ad80c 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -73,3 +73,4 @@ add_subdirectory(VectorToLLVM) add_subdirectory(VectorToSCF) add_subdirectory(VectorToSPIRV) add_subdirectory(VectorToXeGPU) +add_subdirectory(XeVMToLLVM) diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index 77a2708653576..7ac9687c4eeda 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -41,6 +41,16 @@ template struct OpenMPOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + OpenMPOpConversion(LLVMTypeConverter &typeConverter, + PatternBenefit benefit = 1) + : ConvertOpToLLVMPattern(typeConverter, benefit) { + // Operations using CanonicalLoopInfoType are lowered only by + // mlir::translateModuleToLLVMIR() using the OpenMPIRBuilder. Until then, + // the type and operations using it must be preserved. + typeConverter.addConversion( + [&](::mlir::omp::CanonicalLoopInfoType type) { return type; }); + } + LogicalResult matchAndRewrite(T op, typename T::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index b89fde4fbc17e..c1f40dcbd5ca0 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -807,6 +807,7 @@ class MaxPool2dConverter : public OpConversionPattern { ValueRange{paddedInput, fakeWindowDims}, filledEmptyTensor, strideAttr, dilationAttr); + rewriter.setInsertionPointAfter(op); rewriter.replaceOp(op, resultOp); // NaN propagation has no meaning for non floating point types. 
diff --git a/mlir/lib/Conversion/XeVMToLLVM/CMakeLists.txt b/mlir/lib/Conversion/XeVMToLLVM/CMakeLists.txt new file mode 100644 index 0000000000000..4ac60d8d43472 --- /dev/null +++ b/mlir/lib/Conversion/XeVMToLLVM/CMakeLists.txt @@ -0,0 +1,21 @@ +add_mlir_conversion_library(MLIRXeVMToLLVM + XeVMToLLVM.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/XeVMToLLVM + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRFuncDialect + MLIRGPUDialect + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRXeVMDialect + MLIRPass + MLIRTransforms +) diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp new file mode 100644 index 0000000000000..a8380b9669f0f --- /dev/null +++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp @@ -0,0 +1,636 @@ +//===-- XeVMToLLVM.cpp - XeVM to LLVM dialect conversion --------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/XeVMToLLVM/XeVMToLLVM.h" + +#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/XeVMDialect.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/FormatVariadic.h" + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Types.h" + +#include "llvm/ADT/TypeSwitch.h" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTXEVMTOLLVMPASS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; +using namespace xevm; + +namespace { + +struct LLVMFuncAttributeOptions { + bool isConvergent = false; + bool isNoUnwind = false; + bool isWillReturn = false; + LLVM::MemoryEffectsAttr memEffectsAttr{}; +}; +static constexpr LLVMFuncAttributeOptions noUnwindAttrs = { + false, true, false, {}}; +static constexpr LLVMFuncAttributeOptions noUnwindWillReturnAttrs = { + false, true, true, {}}; +static constexpr LLVMFuncAttributeOptions convergentNoUnwindWillReturnAttrs = { + true, true, true, {}}; + +std::string getTypeMangling(Type ty, bool isUnsigned = false) { + return TypeSwitch(ty) + .Case([isUnsigned](VectorType ty) -> std::string { + return "Dv" + std::to_string(ty.getNumElements()) + "_" + + getTypeMangling(ty.getElementType(), isUnsigned); + }) + .Case([](Float16Type) -> std::string { return "Dh"; }) + .Case([](Float32Type) -> std::string { return "f"; }) + .Case([](Float64Type) -> std::string { return "d"; }) + .Case([isUnsigned](IntegerType ty) -> std::string { + switch (ty.getWidth()) { + case 8: + return isUnsigned ? "h" : "c"; + case 16: + return isUnsigned ? "t" : "s"; + case 32: + return isUnsigned ? 
"j" : "i"; + case 64: + return isUnsigned ? "m" : "l"; + default: + llvm_unreachable("unhandled integer type"); + } + }) + .Default([](Type) -> std::string { + llvm_unreachable("unhandled type for mangling"); + }); +} + +std::string mangle(StringRef baseName, ArrayRef types, + ArrayRef isUnsigned = {}) { + assert((isUnsigned.empty() || isUnsigned.size() == types.size()) && + "Signedness info doesn't match"); + std::string s; + llvm::raw_string_ostream os(s); + llvm::SmallDenseMap substitutions; + os << "_Z" << baseName.size() << baseName; + for (auto [idx, type] : llvm::enumerate(types)) { + auto it = substitutions.find(type); + if (it != substitutions.end()) { + os << "S"; + // First substitution is `S_`, second is `S0_`, and so on. + if (unsigned firstIdx = it->getSecond(); firstIdx > 0) + os << firstIdx - 1; + os << "_"; + } else { + if (!type.isIntOrFloat()) + substitutions[type] = substitutions.size(); + os << getTypeMangling(type, isUnsigned.empty() ? false : isUnsigned[idx]); + } + } + return os.str(); +} + +template +int32_t getL1CacheControl(OpType op) { + int32_t control = 0; + if constexpr (isLoad) { + switch (*op.getCacheControl()) { + case LoadCacheControl::L1UC_L2UC_L3UC: + case LoadCacheControl::L1UC_L2UC_L3C: + case LoadCacheControl::L1UC_L2C_L3UC: + case LoadCacheControl::L1UC_L2C_L3C: + control = 1; + break; + case LoadCacheControl::L1C_L2UC_L3UC: + case LoadCacheControl::L1C_L2UC_L3C: + case LoadCacheControl::L1C_L2C_L3UC: + case LoadCacheControl::L1C_L2C_L3C: + control = 2; + break; + case LoadCacheControl::L1S_L2UC_L3UC: + case LoadCacheControl::L1S_L2UC_L3C: + case LoadCacheControl::L1S_L2C_L3UC: + case LoadCacheControl::L1S_L2C_L3C: + control = 3; + break; + case LoadCacheControl::INVALIDATE_READ: + control = 4; + break; + } + } else { + switch (*op.getCacheControl()) { + case StoreCacheControl::L1UC_L2UC_L3UC: + case StoreCacheControl::L1UC_L2UC_L3WB: + case StoreCacheControl::L1UC_L2WB_L3UC: + case StoreCacheControl::L1UC_L2WB_L3WB: + 
control = 1; + break; + case StoreCacheControl::L1WT_L2UC_L3UC: + case StoreCacheControl::L1WT_L2UC_L3WB: + case StoreCacheControl::L1WT_L2WB_L3UC: + case StoreCacheControl::L1WT_L2WB_L3WB: + control = 2; + break; + case StoreCacheControl::L1S_L2UC_L3UC: + case StoreCacheControl::L1S_L2UC_L3WB: + case StoreCacheControl::L1S_L2WB_L3UC: + case StoreCacheControl::L1S_L2WB_L3WB: + control = 3; + break; + case StoreCacheControl::L1WB_L2UC_L3UC: + case StoreCacheControl::L1WB_L2WB_L3UC: + case StoreCacheControl::L1WB_L2UC_L3WB: + control = 4; + break; + } + } + return control; +} + +template +int32_t getL3CacheControl(OpType op) { + int32_t control = 0; + if constexpr (isLoad) { + switch (*op.getCacheControl()) { + case LoadCacheControl::L1UC_L2UC_L3UC: + case LoadCacheControl::L1UC_L2C_L3UC: + case LoadCacheControl::L1C_L2UC_L3UC: + case LoadCacheControl::L1C_L2C_L3UC: + case LoadCacheControl::L1S_L2UC_L3UC: + case LoadCacheControl::L1S_L2C_L3UC: + control = 1; + break; + case LoadCacheControl::L1UC_L2UC_L3C: + case LoadCacheControl::L1UC_L2C_L3C: + case LoadCacheControl::L1C_L2UC_L3C: + case LoadCacheControl::L1C_L2C_L3C: + case LoadCacheControl::L1S_L2UC_L3C: + case LoadCacheControl::L1S_L2C_L3C: + control = 2; + break; + case LoadCacheControl::INVALIDATE_READ: + control = 4; + break; + } + } else { + switch (*op.getCacheControl()) { + case StoreCacheControl::L1UC_L2UC_L3UC: + case StoreCacheControl::L1UC_L2WB_L3UC: + case StoreCacheControl::L1WT_L2UC_L3UC: + case StoreCacheControl::L1WT_L2WB_L3UC: + case StoreCacheControl::L1S_L2UC_L3UC: + case StoreCacheControl::L1S_L2WB_L3UC: + case StoreCacheControl::L1WB_L2UC_L3UC: + case StoreCacheControl::L1WB_L2WB_L3UC: + control = 1; + break; + case StoreCacheControl::L1UC_L2UC_L3WB: + case StoreCacheControl::L1UC_L2WB_L3WB: + case StoreCacheControl::L1WT_L2UC_L3WB: + case StoreCacheControl::L1WT_L2WB_L3WB: + case StoreCacheControl::L1S_L2UC_L3WB: + case StoreCacheControl::L1S_L2WB_L3WB: + case 
StoreCacheControl::L1WB_L2UC_L3WB: + control = 2; + break; + } + } + return control; +} + +template +static std::optional +getCacheControlMetadata(ConversionPatternRewriter &rewriter, OpType op) { + if (!op.getCacheControl()) + return {}; + constexpr int32_t decorationCacheControlArity{4}; + constexpr int32_t loadCacheControlKey{6442}; + constexpr int32_t storeCacheControlKey{6443}; + const int32_t controlKey{isLoad ? loadCacheControlKey : storeCacheControlKey}; + SmallVector decorationsL1{ + controlKey, 0, getL1CacheControl(op), 0}; + SmallVector decorationsL3{ + controlKey, 1, getL3CacheControl(op), 0}; + auto arrayAttrL1 = rewriter.getI32ArrayAttr(decorationsL1); + auto arrayAttrL3 = rewriter.getI32ArrayAttr(decorationsL3); + + SmallVector combinedAttrs = {arrayAttrL1, arrayAttrL3}; + return rewriter.getArrayAttr(combinedAttrs); +} + +static LLVM::CallOp createDeviceFunctionCall( + ConversionPatternRewriter &rewriter, StringRef funcName, Type retType, + ArrayRef argTypes, ArrayRef args, + mlir::ArrayRef> paramAttrs, + LLVMFuncAttributeOptions funcAttributeOptions, Operation *op) { + auto moduleOp = op->getParentWithTrait(); + assert(moduleOp && "Expecting module"); + Location loc = op->getLoc(); + + auto funcOpRes = + LLVM::lookupOrCreateFn(rewriter, moduleOp, funcName, argTypes, retType); + assert(!failed(funcOpRes)); + LLVM::LLVMFuncOp funcOp = funcOpRes.value(); + funcOp.setCConv(LLVM::cconv::CConv::SPIR_FUNC); + funcOp.setConvergent(funcAttributeOptions.isConvergent); + funcOp.setNoUnwind(funcAttributeOptions.isNoUnwind); + funcOp.setWillReturn(funcAttributeOptions.isWillReturn); + + if (funcAttributeOptions.memEffectsAttr) + funcOp.setMemoryEffectsAttr(funcAttributeOptions.memEffectsAttr); + + for (auto [idx, attrName] : paramAttrs) + funcOp.setArgAttr(idx, attrName, rewriter.getUnitAttr()); + + auto callOp = rewriter.create(loc, funcOp, args); + callOp->setAttrs(funcOp->getAttrs()); + + return callOp; +} + +class MMAToOCLPattern : public 
OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xevm::MMAOp op, xevm::MMAOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (!op.getC()) { + return rewriter.notifyMatchFailure(op, "OCL requires C operand"); + } + auto precisionA = op.getTypes().getA(); + auto precisionB = op.getTypes().getB(); + auto precisionC = op.getTypes().getC(); + auto precisionD = op.getTypes().getD(); + if (precisionC != precisionD) { + return rewriter.notifyMatchFailure(op, "type of C and D need to match"); + } + if (precisionC != xevm::ElemType::S32 && + precisionC != xevm::ElemType::F32 && + precisionC != xevm::ElemType::F16 && + precisionC != xevm::ElemType::BF16) { + return rewriter.notifyMatchFailure( + op, "type of C and D must be S32, F32, F16 or BF16"); + } + if (precisionA == xevm::ElemType::S32 || + precisionA == xevm::ElemType::F32) { + return rewriter.notifyMatchFailure(op, "type of A cannot be S32 or F32"); + } + if (precisionB == xevm::ElemType::S32 || + precisionB == xevm::ElemType::F32) { + return rewriter.notifyMatchFailure(op, "type of B cannot be S32 or F32"); + } + constexpr uint32_t bitWidthPackedA{16}; + constexpr uint32_t bitWidthPackedB{32}; + auto loc = op.getLoc(); + + auto castIfNeeded = [&](Value val, Type packedType) -> Value { + VectorType origTy = cast(val.getType()); + const uint32_t vecBitSize = + origTy.getNumElements() * + origTy.getElementType().getIntOrFloatBitWidth(); + VectorType newTy = VectorType::get( + vecBitSize / packedType.getIntOrFloatBitWidth(), packedType); + if (origTy != newTy) + val = rewriter.create(loc, newTy, val); + return val; + }; + + Value a = op.getA(); + Type packedAType = (op.getTypes().getA() == xevm::ElemType::TF32) + ? cast(rewriter.getF32Type()) + : rewriter.getIntegerType(bitWidthPackedA); + a = castIfNeeded(a, packedAType); + + Value b = op.getB(); + Type packedBType = (op.getTypes().getB() == xevm::ElemType::TF32) + ? 
cast(rewriter.getF32Type()) + : rewriter.getIntegerType(bitWidthPackedB); + b = castIfNeeded(b, packedBType); + + Value c = op.getC(); + VectorType cOrigTy = cast(c.getType()); + VectorType resOrigTy = cast(op->getResultTypes()[0]); + assert(cOrigTy == resOrigTy && "Accumulator and result type mismatch"); + // OCL builtins encode bfloat16 as int16 + VectorType cTy = + cOrigTy.getElementType().isBF16() + ? VectorType::get(cOrigTy.getShape(), rewriter.getIntegerType(16)) + : cOrigTy; + VectorType resTy = cTy; + if (cOrigTy != cTy) + c = rewriter.create(loc, cTy, c); + + constexpr int32_t systolicDepth{8}; + std::string fnName = + llvm::formatv("intel_sub_group_{0}_{1}_matrix_mad_k{2}", + stringifyElemType(op.getTypes().getA()).str(), + stringifyElemType(op.getTypes().getB()).str(), + systolicDepth * + getNumOperandsPerDword(op.getTypes().getA())) + .str(); + SmallVector argTypes{a.getType(), b.getType(), cTy}; + fnName = mangle(fnName, argTypes); + SmallVector args{a, b, c}; + + auto memAttr = rewriter.getAttr( + /*other=*/LLVM::ModRefInfo::NoModRef, + /*argMem=*/LLVM::ModRefInfo::NoModRef, + /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef); + auto funcAttrs = convergentNoUnwindWillReturnAttrs; + funcAttrs.memEffectsAttr = memAttr; + Value result = + createDeviceFunctionCall(rewriter, fnName, resTy, argTypes, args, {}, + funcAttrs, op.getOperation()) + ->getResult(0); + + if (resOrigTy != resTy) + result = rewriter.create(loc, resOrigTy, result); + + rewriter.replaceOp(op, result); + return success(); + } + +private: + static unsigned getNumOperandsPerDword(xevm::ElemType pTy) { + switch (pTy) { + case xevm::ElemType::TF32: + return 1; + case xevm::ElemType::BF16: + case xevm::ElemType::F16: + return 2; + case xevm::ElemType::U8: + case xevm::ElemType::S8: + return 4; + default: + llvm_unreachable("unsupported xevm::ElemType"); + } + } +}; + +class PrefetchToOCLPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + 
matchAndRewrite(PrefetchOp op, PrefetchOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + const std::string fnName{"_Z8prefetchPU3AS1Kcm"}; + Value one = + rewriter.create(loc, rewriter.getI64Type(), 1); + SmallVector args{op.getPtr(), one}; + SmallVector argTypes; + for (auto arg : args) + argTypes.push_back(arg.getType()); + auto funcAttr = noUnwindAttrs; + auto memAttr = rewriter.getAttr( + /*other=*/LLVM::ModRefInfo::NoModRef, + /*argMem=*/LLVM::ModRefInfo::Ref, + /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef); + funcAttr.memEffectsAttr = memAttr; + + LLVM::CallOp call = createDeviceFunctionCall( + rewriter, fnName, LLVM::LLVMVoidType::get(rewriter.getContext()), + argTypes, args, {}, funcAttr, op.getOperation()); + if (std::optional optCacheControls = + getCacheControlMetadata(rewriter, op)) + call->setAttr(XeVMDialect::getCacheControlsAttrName(), *optCacheControls); + rewriter.eraseOp(op); + return success(); + } +}; + +class MemfenceToOCLPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(MemfenceOp op, MemfenceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + const std::string fnName{"atomic_work_item_fence"}; + int memScope, addrSpace; + switch (op.getAddrspace()) { + case xevm::AddrSpace::SHARED: + addrSpace = 1; // CLK_LOCAL_MEM_FENCE + break; + case xevm::AddrSpace::GLOBAL: + addrSpace = 2; // CLK_GLOBAL_MEM_FENCE + break; + default: + // GENERIC is not supported in OpenCL + return rewriter.notifyMatchFailure( + op, "Fence only supports global and shared address spaces."); + } + switch (op.getScope()) { + case xevm::MemScope::WORKGROUP: + memScope = 1; + break; + case xevm::MemScope::DEVICE: + memScope = 2; + break; + default: + // CLUSTER and SYSTEM are not supported in OpenCL + return rewriter.notifyMatchFailure( + op, "Fence only supports workgroup and device memory 
scopes."); + } + Type i32Type = rewriter.getI32Type(); + Value acqRel = rewriter.create(loc, i32Type, 4); + Value memScopeConst = + rewriter.create(loc, i32Type, memScope); + Value addrSpaceConst = + rewriter.create(loc, i32Type, addrSpace); + SmallVector args{addrSpaceConst, acqRel, memScopeConst}; + SmallVector argTypes{3, i32Type}; + createDeviceFunctionCall(rewriter, mangle(fnName, argTypes), + LLVM::LLVMVoidType::get(rewriter.getContext()), + argTypes, args, {}, noUnwindAttrs, + op.getOperation()); + rewriter.eraseOp(op); + return success(); + } +}; +template +class LoadStorePrefetchToOCLPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + constexpr bool isLoad = std::is_same_v; + constexpr bool isPrefetch = std::is_same_v; + + auto loc = op.getLoc(); + VectorType vecType; + bool packReg = false; + bool transpose = false; + if constexpr (isLoad) { + vecType = op.getRes().getType(); + packReg = op.getPackRegister(); + transpose = op.getTranspose(); + } else if constexpr (!isPrefetch) { + vecType = op.getStoredVal().getType(); + } + + auto i32Type = rewriter.getI32Type(); + Value byteCoord = + rewriter.create(loc, VectorType::get(2, i32Type)); + Value zero = rewriter.create(loc, i32Type, 0); + Value one = rewriter.create(loc, i32Type, 1); + byteCoord = rewriter.create( + loc, VectorType::get(2, i32Type), byteCoord, op.getX(), zero); + byteCoord = rewriter.create( + loc, VectorType::get(2, i32Type), byteCoord, op.getY(), one); + SmallVector args{op.getPtr(), op.getBaseWidth(), op.getBaseHeight(), + op.getBasePitch(), byteCoord}; + SmallVector retTypes; + Value spvLoadDstPtr; + std::string funcName{"intel_sub_group_2d_block_"}; + std::string bitWidthId; + LLVMFuncAttributeOptions funcAttr{noUnwindWillReturnAttrs}; + SmallVector, 4> paramAttrs; + if constexpr (isPrefetch) { // Prefetch + 
funcName += "prefetch"; + paramAttrs = {std::make_pair(0, LLVM::LLVMDialect::getNonNullAttrName())}; + auto memAttr = rewriter.getAttr( + /*other=*/LLVM::ModRefInfo::NoModRef, + /*argMem=*/LLVM::ModRefInfo::Ref, + /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef); + funcAttr = noUnwindAttrs; + funcAttr.memEffectsAttr = memAttr; + } else { + auto vecElemType = vecType.getElementType(); + auto vecElemBitWidth = vecElemType.getIntOrFloatBitWidth(); + Value numElems = rewriter.create( + loc, i32Type, vecType.getNumElements()); + auto dstOrSrcPtr = rewriter.create( + loc, LLVM::LLVMPointerType::get(rewriter.getContext()), vecElemType, + numElems); + args.push_back(dstOrSrcPtr); + if constexpr (isLoad) { // Load + funcName += "read"; + bitWidthId = getTypeMangling(vecElemType, /*isUnsigned=*/true); + if (packReg) + funcName += "_transform"; + else if (transpose) + funcName += "_transpose"; + spvLoadDstPtr = dstOrSrcPtr; + retTypes.push_back(vecType); + paramAttrs = { + std::make_pair(0, LLVM::LLVMDialect::getNonNullAttrName()), + std::make_pair(0, LLVM::LLVMDialect::getReadonlyAttrName()), + std::make_pair(5, LLVM::LLVMDialect::getNonNullAttrName()), + std::make_pair(5, LLVM::LLVMDialect::getWriteOnlyAttrName()), + }; + } else { // Store + funcName += "write"; + bitWidthId = (vecElemBitWidth == 32) + ? "j" + : ((vecElemBitWidth == 16) ? 
"t" : "h"); + rewriter.create(loc, op.getStoredVal(), dstOrSrcPtr); + paramAttrs = { + std::make_pair(0, LLVM::LLVMDialect::getNonNullAttrName()), + std::make_pair(0, LLVM::LLVMDialect::getWriteOnlyAttrName()), + std::make_pair(5, LLVM::LLVMDialect::getNonNullAttrName()), + std::make_pair(5, LLVM::LLVMDialect::getReadonlyAttrName()), + }; + } + } + + funcName = + llvm::formatv("{0}_{1}b_{2}r{3}x{4}c", funcName, op.getElemSizeInBits(), + op.getTileHeight(), op.getTileWidth(), op.getVBlocks()) + .str(); + std::string prefetchCode(""); + if (!isPrefetch) + prefetchCode += "P"; + funcName = llvm::formatv("_Z{0}{1}PU3AS1viiiDv2_i{2}{3}", funcName.size(), + funcName, prefetchCode, bitWidthId) + .str(); + SmallVector argTypes; + for (auto arg : args) { + argTypes.push_back(arg.getType()); + } + LLVM::CallOp call = createDeviceFunctionCall( + rewriter, funcName, LLVM::LLVMVoidType::get(rewriter.getContext()), + argTypes, args, paramAttrs, funcAttr, op.getOperation()); + if (std::optional optCacheControls = + getCacheControlMetadata < isLoad || isPrefetch > (rewriter, op)) { + call->setAttr(XeVMDialect::getCacheControlsAttrName(), *optCacheControls); + } + if constexpr (isLoad) + rewriter.replaceOp( + op, rewriter.create(loc, vecType, spvLoadDstPtr)); + else + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Pass Definition +//===----------------------------------------------------------------------===// + +struct ConvertXeVMToLLVMPass + : public impl::ConvertXeVMToLLVMPassBase { + using Base::Base; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addIllegalDialect(); + RewritePatternSet patterns(&getContext()); + populateXeVMToLLVMConversionPatterns(patterns); + if (failed(applyPartialConversion(getOperation(), target, + 
std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace + +//===----------------------------------------------------------------------===// +// ConvertToLLVMPatternInterface implementation +//===----------------------------------------------------------------------===// + +namespace { +/// Implement the interface to convert XeVM to LLVM. +struct XeVMToLLVMDialectInterface : public ConvertToLLVMPatternInterface { + using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface; + void loadDependentDialects(MLIRContext *context) const final { + context->loadDialect(); + } + + /// Hook for derived dialect interface to provide conversion patterns + /// and mark dialect legal for the conversion target. + void populateConvertToLLVMConversionPatterns( + ConversionTarget &target, LLVMTypeConverter &typeConverter, + RewritePatternSet &patterns) const final { + populateXeVMToLLVMConversionPatterns(patterns); + } +}; +} // namespace + +//===----------------------------------------------------------------------===// +// Pattern Population +//===----------------------------------------------------------------------===// + +void ::mlir::populateXeVMToLLVMConversionPatterns(RewritePatternSet &patterns) { + patterns.add, + LoadStorePrefetchToOCLPattern, + LoadStorePrefetchToOCLPattern, + MMAToOCLPattern, MemfenceToOCLPattern, PrefetchToOCLPattern>( + patterns.getContext()); +} + +void ::mlir::registerConvertXeVMToLLVMInterface(DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) { + dialect->addInterfaces(); + }); +} diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 4862d1f722785..f2f010a771b77 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -44,6 +44,7 @@ add_mlir_dialect_library(MLIRGPUTransforms Transforms/ShuffleRewriter.cpp Transforms/SubgroupIdRewriter.cpp Transforms/SubgroupReduceLowering.cpp + Transforms/XeVMAttachTarget.cpp 
OBJECT @@ -78,6 +79,7 @@ add_mlir_dialect_library(MLIRGPUTransforms MLIRSupport MLIRTransformUtils MLIRVectorDialect + MLIRXeVMDialect ) add_subdirectory(TransformOps) diff --git a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp new file mode 100644 index 0000000000000..e9cf4939a13b8 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp @@ -0,0 +1,92 @@ +//===-- XeVMAttachTarget.cpp - Attach an XeVM target ----------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the `GpuXeVMAttachTarget` pass, attaching `#xevm.target` +// attributes to GPU modules. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/XeVMDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "llvm/Support/Regex.h" + +namespace mlir { +#define GEN_PASS_DEF_GPUXEVMATTACHTARGET +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +using namespace mlir; +using namespace mlir::xevm; + +namespace { +struct XeVMAttachTarget + : public mlir::impl::GpuXeVMAttachTargetBase { + using Base::Base; + + DictionaryAttr getFlags(OpBuilder &builder) const; + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } +}; +} // namespace + +DictionaryAttr XeVMAttachTarget::getFlags(OpBuilder &builder) const { + SmallVector flags; + // Tokenize and set the optional command line options. 
+ if (!cmdOptions.empty()) { + std::pair> options = + gpu::TargetOptions::tokenizeCmdOptions(cmdOptions); + if (!options.second.empty()) { + llvm::SmallVector xevmOptionAttrs; + for (const char *opt : options.second) { + xevmOptionAttrs.emplace_back( + mlir::StringAttr::get(builder.getContext(), StringRef(opt))); + } + flags.push_back(builder.getNamedAttr( + "cmd-options", + mlir::ArrayAttr::get(builder.getContext(), xevmOptionAttrs))); + } + } + + if (!flags.empty()) + return builder.getDictionaryAttr(flags); + return nullptr; +} + +void XeVMAttachTarget::runOnOperation() { + OpBuilder builder(&getContext()); + ArrayRef libs(linkLibs); + SmallVector filesToLink(libs); + auto target = builder.getAttr( + optLevel, triple, chip, getFlags(builder), + filesToLink.empty() ? nullptr : builder.getStrArrayAttr(filesToLink)); + llvm::Regex matcher(moduleMatcher); + for (Region ®ion : getOperation()->getRegions()) + for (Block &block : region.getBlocks()) + for (auto module : block.getOps()) { + // Check if the name of the module matches. + if (!moduleMatcher.empty() && !matcher.match(module.getName())) + continue; + // Create the target array. + SmallVector targets; + if (std::optional attrs = module.getTargets()) + targets.append(attrs->getValue().begin(), attrs->getValue().end()); + targets.push_back(target); + // Remove any duplicate targets. + targets.erase(llvm::unique(targets), targets.end()); + // Update the target attribute array. 
+ module.setTargetsAttr(builder.getArrayAttr(targets)); + } +} diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 80c807e774a7e..f2eab62b286af 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -293,22 +293,15 @@ static LogicalResult checkVarAndVarType(Op op) { if (!op.getVar()) return op.emitError("must have var operand"); - if (mlir::isa(op.getVar().getType()) && - mlir::isa(op.getVar().getType())) { - // TODO: If a type implements both interfaces (mappable and pointer-like), - // it is unclear which semantics to apply without additional info which - // would need captured in the data operation. For now restrict this case - // unless a compelling reason to support disambiguating between the two. - return op.emitError("var must be mappable or pointer-like (not both)"); - } - + // A variable must have a type that is either pointer-like or mappable. if (!mlir::isa(op.getVar().getType()) && !mlir::isa(op.getVar().getType())) return op.emitError("var must be mappable or pointer-like"); - if (mlir::isa(op.getVar().getType()) && - op.getVarType() != op.getVar().getType()) - return op.emitError("varType must match when var is mappable"); + // When it is a pointer-like type, the varType must capture the target type. 
+ if (mlir::isa(op.getVar().getType()) && + op.getVarType() == op.getVar().getType()) + return op.emitError("varType must capture the element type of var"); return success(); } diff --git a/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt index 1a8f30dd39871..b9aa7b7491abf 100644 --- a/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt @@ -7,9 +7,9 @@ add_mlir_dialect_library(MLIRSPIRVDialect CastOps.cpp ControlFlowOps.cpp CooperativeMatrixOps.cpp + DotProductOps.cpp GroupOps.cpp ImageOps.cpp - IntegerDotProductOps.cpp MemoryOps.cpp MeshOps.cpp SPIRVAttributes.cpp diff --git a/mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp b/mlir/lib/Dialect/SPIRV/IR/DotProductOps.cpp similarity index 83% rename from mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp rename to mlir/lib/Dialect/SPIRV/IR/DotProductOps.cpp index f5676f36a0f5f..01ef1bdc42515 100644 --- a/mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/DotProductOps.cpp @@ -1,4 +1,4 @@ -//===- IntegerDotProductOps.cpp - MLIR SPIR-V Integer Dot Product Ops ----===// +//===- DotProductOps.cpp - MLIR SPIR-V Dot Product Ops -------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Defines the Integer Dot Product operations in the SPIR-V dialect. +// Defines the Dot Product operations in the SPIR-V dialect. 
// //===----------------------------------------------------------------------===// @@ -21,6 +21,44 @@ using namespace mlir::spirv::AttrNames; namespace mlir::spirv { +//===----------------------------------------------------------------------===// +// Dot Product ops +//===----------------------------------------------------------------------===// + +static std::optional getDotProductMinVersion() { + return spirv::Version::V_1_0; // Available in SPIR-V >= 1.0. +} + +static std::optional getDotProductMaxVersion() { + return spirv::Version::V_1_6; // Available in SPIR-V <= 1.6. +} + +SmallVector, 1> DotOp::getExtensions() { + if (isa(getType())) { + static const auto extension = spirv::Extension::SPV_KHR_bfloat16; + return {extension}; + } + + return {}; +} + +SmallVector, 1> DotOp::getCapabilities() { + if (isa(getType())) { + static const auto capability = spirv::Capability::BFloat16DotProductKHR; + return {capability}; + } + + return {}; +} + +std::optional DotOp::getMinVersion() { + return getDotProductMinVersion(); +} + +std::optional DotOp::getMaxVersion() { + return getDotProductMaxVersion(); +} + //===----------------------------------------------------------------------===// // Integer Dot Product ops //===----------------------------------------------------------------------===// @@ -71,14 +109,6 @@ static LogicalResult verifyIntegerDotProduct(Operation *op) { return success(); } -static std::optional getIntegerDotProductMinVersion() { - return spirv::Version::V_1_0; // Available in SPIR-V >= 1.0. -} - -static std::optional getIntegerDotProductMaxVersion() { - return spirv::Version::V_1_6; // Available in SPIR-V <= 1.6. 
-} - static SmallVector, 1> getIntegerDotProductExtensions() { // Requires the SPV_KHR_integer_dot_product extension, specified either @@ -136,10 +166,10 @@ getIntegerDotProductCapabilities(Operation *op) { return getIntegerDotProductCapabilities(*this); \ } \ std::optional OpName::getMinVersion() { \ - return getIntegerDotProductMinVersion(); \ + return getDotProductMinVersion(); \ } \ std::optional OpName::getMaxVersion() { \ - return getIntegerDotProductMaxVersion(); \ + return getDotProductMaxVersion(); \ } SPIRV_IMPL_INTEGER_DOT_PRODUCT_OP(SDotOp) diff --git a/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp index fc93f1c1c9220..26406ceef082c 100644 --- a/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp +++ b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp @@ -11,10 +11,6 @@ #include "mlir/Dialect/Tensor/IR/ShardingInterfaceImpl.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/DialectRegistry.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "tensor-sharding-impl" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ") using namespace mlir; using namespace mlir::tensor; diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp index 5b65e47bc937b..2c9cd87f14af2 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" diff --git a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp index 3e3422162a8da..4ec13e189f621 100644 --- 
a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp @@ -10,7 +10,6 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Interfaces/InferTypeOpInterface.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index a3e863254405c..b035a53692dcf 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -35,11 +35,8 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/LogicalResult.h" #include "llvm/Support/MathExtras.h" -#include #include -#include using namespace mlir; using namespace mlir::tensor; diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 92540bd56ecbc..437bc5d00faa8 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -10,15 +10,11 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Tensor/Utils/Utils.h" -#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/TilingInterface.h" -#include "mlir/Interfaces/ValueBoundsOpInterface.h" using namespace mlir; using namespace mlir::tensor; diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index 
829b2ab92ac24..47b41efbed83b 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -15,7 +15,6 @@ #include "mlir/Dialect/Bufferization/IR/DstBufferizableOpInterfaceImpl.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" diff --git a/mlir/lib/Dialect/Tensor/Transforms/ConcatOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ConcatOpPatterns.cpp index a2a860fcb38ab..20bed05ecc11d 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/ConcatOpPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/ConcatOpPatterns.cpp @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Transforms/Transforms.h" #include "mlir/IR/PatternMatch.h" diff --git a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp index fa748cf01977f..3c2b0ab42f7a6 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp @@ -9,7 +9,6 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Transforms/Transforms.h" #include "mlir/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" using namespace mlir; using namespace mlir::tensor; diff --git a/mlir/lib/Dialect/Tensor/Transforms/ExtractSliceFromReshapeUtils.cpp b/mlir/lib/Dialect/Tensor/Transforms/ExtractSliceFromReshapeUtils.cpp index e0acaee9f6626..dd50ae54d17cc 100644 --- 
a/mlir/lib/Dialect/Tensor/Transforms/ExtractSliceFromReshapeUtils.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/ExtractSliceFromReshapeUtils.cpp @@ -11,11 +11,9 @@ // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Transforms/TransformUtils.h" -#include "mlir/Dialect/Tensor/Transforms/Transforms.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/BuiltinTypes.h" diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp index a787b485f7162..13de55b0672a5 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp @@ -16,14 +16,11 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Transforms/Passes.h" #include "mlir/Dialect/Tensor/Transforms/Transforms.h" -#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BuiltinAttributes.h" -#include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "llvm/ADT/TypeSwitch.h" #include namespace mlir { diff --git a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp index 4655fa3cf0d23..bad56d4111dca 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp @@ -11,7 +11,6 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Transforms/Transforms.h" #include 
"mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp index 657624b817af2..20bb4d1caf019 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp @@ -13,7 +13,6 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/LogicalResult.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp index 6138821ee8c61..6e3285abffbfc 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp @@ -13,7 +13,6 @@ #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Interfaces/RuntimeVerifiableOpInterface.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index 289296a07d9d3..3f6258b5e4d43 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -13,10 +13,8 @@ #include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Utils/IndexingUtils.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp 
index be29298a35aeb..d3a5f44798106 100644 --- a/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp @@ -13,7 +13,6 @@ #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/DialectRegistry.h" -#include "llvm/Support/Debug.h" #define DEBUG_TYPE "tosa-sharding-impl" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ") diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index 1d21096e8920b..2dd45d27157cb 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -15,20 +15,14 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" -#include "mlir/Dialect/Tosa/Utils/QuantUtils.h" -#include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/FoldUtils.h" #include "mlir/Transforms/InliningUtils.h" -#include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/TypeSwitch.h" #include diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 5170a11523845..4a952ac062cad 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -22,12 +22,10 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/TypeSwitch.h" #include 
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp index 9b4cf85c480d3..f6caa2a985a4d 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp @@ -15,7 +15,6 @@ #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/Pass/Pass.h" using namespace mlir; using namespace mlir::tosa; diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp index ea6ac981b53cc..df6d52615478e 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp @@ -18,8 +18,6 @@ #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" -#include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" -#include "mlir/Pass/Pass.h" using namespace mlir; using namespace mlir::tosa; diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp index 9c6658c9a5bf8..d33ebe397cd35 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp @@ -20,9 +20,6 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectResourceBlobManager.h" #include "mlir/IR/Matchers.h" -#include "mlir/Pass/Pass.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/SmallVector.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp index 29ec9f8db2615..a9e98c8908e15 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp @@ -18,9 
+18,7 @@ #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp index f4ce950828646..aae1ba359e859 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp @@ -13,8 +13,6 @@ #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Tosa/IR/TosaOps.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" namespace mlir { diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp index 7997753469527..8f96fc1b80abe 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp @@ -11,12 +11,9 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" -#include "mlir/Dialect/Tosa/Utils/QuantUtils.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" namespace mlir { diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp index 2092379e65368..ec1865a3bede1 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp +++ 
b/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp @@ -15,8 +15,6 @@ #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Tosa/IR/TosaOps.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" namespace mlir { diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp index 7f85cd52f6bde..8ebbbc94eb6a2 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp @@ -72,9 +72,7 @@ #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" #include "mlir/IR/Iterators.h" -#include "mlir/IR/Matchers.h" #include "llvm/ADT/TypeSwitch.h" -#include #include #include diff --git a/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp b/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp index 12257da878a40..a963b3f063a8a 100644 --- a/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp @@ -8,9 +8,7 @@ #include "mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h" -#include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformTypes.h" -#include "mlir/IR/OpImplementation.h" #include "llvm/Support/InterleavedRange.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp index 4a95fe7459e8c..a500228d68c77 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp @@ -8,7 +8,6 @@ #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Analysis/CallGraph.h" -#include "mlir/Dialect/Transform/IR/TransformAttrs.h" #include "mlir/Dialect/Transform/IR/TransformOps.h" #include 
"mlir/Dialect/Transform/IR/TransformTypes.h" #include "mlir/Dialect/Transform/IR/Utils.h" diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 0db0317461c03..9266a63a0038a 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -9,7 +9,6 @@ #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" -#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Transform/IR/TransformAttrs.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" @@ -23,11 +22,9 @@ #include "mlir/IR/OperationSupport.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Verifier.h" -#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/FunctionImplementation.h" #include "mlir/Interfaces/FunctionInterfaces.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" #include "mlir/Transforms/CSE.h" diff --git a/mlir/lib/Dialect/Transform/LoopExtension/LoopExtensionOps.cpp b/mlir/lib/Dialect/Transform/LoopExtension/LoopExtensionOps.cpp index 34d6221d15fb0..95870e8ef87be 100644 --- a/mlir/lib/Dialect/Transform/LoopExtension/LoopExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/LoopExtension/LoopExtensionOps.cpp @@ -8,8 +8,6 @@ #include "mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Transform/PDLExtension/PDLExtensionOps.cpp b/mlir/lib/Dialect/Transform/PDLExtension/PDLExtensionOps.cpp index 85f61245eb734..41955c8a278f2 100644 --- a/mlir/lib/Dialect/Transform/PDLExtension/PDLExtensionOps.cpp +++ 
b/mlir/lib/Dialect/Transform/PDLExtension/PDLExtensionOps.cpp @@ -9,7 +9,6 @@ #include "mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h" #include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/OpImplementation.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Rewrite/PatternApplicator.h" #include "llvm/ADT/ScopeExit.h" diff --git a/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp b/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp index bfe1d9682177d..18dfd504203a9 100644 --- a/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp @@ -15,7 +15,6 @@ #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Pass/Pass.h" #include "llvm/ADT/SetOperations.h" namespace mlir { diff --git a/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp b/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp index 20db09ca9e8d5..364453431db7e 100644 --- a/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp @@ -12,7 +12,6 @@ #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/Visitors.h" #include "mlir/Interfaces/FunctionInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseSet.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp index 44d82714b894b..35ace1b2e0c3a 100644 --- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp @@ -18,11 +18,9 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Verifier.h" #include "mlir/IR/Visitors.h" -#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Parser/Parser.h" #include 
"mlir/Support/FileUtilities.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/SourceMgr.h" diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index bcaea1c79471f..fe2707629d82e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1088,6 +1088,12 @@ class ExtractOpFromElementwise final if (!llvm::all_equal(eltwise->getOperandTypes())) return rewriter.notifyMatchFailure(op, "operand types are different"); + // Dynamic position can cause dominance issues, so conservatively fail for + // now. + if (!op.getDynamicPosition().empty()) + return rewriter.notifyMatchFailure( + op, "dynamic position not yet implemented"); + Type dstType = op.getType(); OpBuilder::InsertionGuard g(rewriter); diff --git a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp index cc7ab7f3f3895..179e6ee8784e6 100644 --- a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp +++ b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp @@ -86,6 +86,29 @@ x86vector::DotOp::getIntrinsicOperands(ArrayRef operands, return intrinsicOperands; } +SmallVector x86vector::DotInt8Op::getIntrinsicOperands( + ArrayRef operands, const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { + SmallVector intrinsicOprnds; + Adaptor adaptor(operands, *this); + intrinsicOprnds.push_back(adaptor.getW()); + // Bitcast `a` and `b` to i32 + Value bitcast_a = rewriter.create( + getLoc(), + VectorType::get((getA().getType().getShape()[0] / 4), + rewriter.getIntegerType(32)), + adaptor.getA()); + intrinsicOprnds.push_back(bitcast_a); + Value bitcast_b = rewriter.create( + getLoc(), + VectorType::get((getB().getType().getShape()[0] / 4), + rewriter.getIntegerType(32)), + adaptor.getB()); + 
intrinsicOprnds.push_back(bitcast_b); + + return intrinsicOprnds; +} + SmallVector x86vector::BcstToPackedF32Op::getIntrinsicOperands( ArrayRef operands, const LLVMTypeConverter &typeConverter, RewriterBase &rewriter) { diff --git a/mlir/lib/TableGen/Class.cpp b/mlir/lib/TableGen/Class.cpp index c65f67d50a47d..81f1aee73a7f0 100644 --- a/mlir/lib/TableGen/Class.cpp +++ b/mlir/lib/TableGen/Class.cpp @@ -159,6 +159,38 @@ void Method::writeDefTo(raw_indented_ostream &os, StringRef namePrefix) const { os << "}\n\n"; } +bool Method::methodPropertiesAreCompatible(Properties properties) { + const bool isStatic = (properties & Method::Static); + const bool isConstructor = (properties & Method::Constructor); + // const bool isPrivate = (properties & Method::Private); + const bool isDeclaration = (properties & Method::Declaration); + const bool isInline = (properties & Method::Inline); + const bool isConstexprValue = (properties & Method::ConstexprValue); + const bool isConst = (properties & Method::Const); + + // Note: assert to immediately fail and thus simplify debugging. 
+ if (isStatic && isConstructor) { + assert(false && "constructor cannot be static"); + return false; + } + if (isConstructor && isConst) { // albeit constexpr is fine + assert(false && "constructor cannot be const"); + return false; + } + if (isDeclaration && isInline) { + assert(false && + "declaration implies no definition and thus cannot be inline"); + return false; + } + if (isDeclaration && isConstexprValue) { + assert(false && + "declaration implies no definition and thus cannot be constexpr"); + return false; + } + + return true; +} + //===----------------------------------------------------------------------===// // Constructor definitions //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 336f71ccd06a3..68a8d1758e434 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3095,6 +3095,67 @@ convertOmpLoopNest(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +/// Convert an omp.canonical_loop to LLVM-IR +static LogicalResult +convertOmpCanonicalLoopOp(omp::CanonicalLoopOp op, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + + llvm::OpenMPIRBuilder::LocationDescription loopLoc(builder); + Value loopIV = op.getInductionVar(); + Value loopTC = op.getTripCount(); + + llvm::Value *llvmTC = moduleTranslation.lookupValue(loopTC); + + llvm::Expected llvmOrError = + ompBuilder->createCanonicalLoop( + loopLoc, + [&](llvm::OpenMPIRBuilder::InsertPointTy ip, llvm::Value *llvmIV) { + // Register the mapping of MLIR induction variable to LLVM-IR + // induction variable + moduleTranslation.mapValue(loopIV, llvmIV); + + builder.restoreIP(ip); + llvm::Expected 
bodyGenStatus = + convertOmpOpRegions(op.getRegion(), "omp.loop.region", builder, + moduleTranslation); + + return bodyGenStatus.takeError(); + }, + llvmTC, "omp.loop"); + if (!llvmOrError) + return op.emitError(llvm::toString(llvmOrError.takeError())); + + llvm::CanonicalLoopInfo *llvmCLI = *llvmOrError; + llvm::IRBuilderBase::InsertPoint afterIP = llvmCLI->getAfterIP(); + builder.restoreIP(afterIP); + + // Register the mapping of MLIR loop to LLVM-IR OpenMPIRBuilder loop + if (Value cli = op.getCli()) + moduleTranslation.mapOmpLoop(cli, llvmCLI); + + return success(); +} + +/// Apply a `#pragma omp unroll` / "!$omp unroll" transformation using the +/// OpenMPIRBuilder. +static LogicalResult +applyUnrollHeuristic(omp::UnrollHeuristicOp op, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + + Value applyee = op.getApplyee(); + assert(applyee && "Loop to apply unrolling on required"); + + llvm::CanonicalLoopInfo *consBuilderCLI = + moduleTranslation.lookupOMPLoop(applyee); + llvm::OpenMPIRBuilder::LocationDescription loc(builder); + ompBuilder->unrollLoopHeuristic(loc.DL, consBuilderCLI); + + moduleTranslation.invalidateOmpLoop(applyee); + return success(); +} + /// Convert an Atomic Ordering attribute to llvm::AtomicOrdering. static llvm::AtomicOrdering convertAtomicOrdering(std::optional ao) { @@ -5989,6 +6050,23 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, // etc. and then discarded return success(); }) + .Case([&](omp::NewCliOp op) { + // Meta-operation: Doesn't do anything by itself, but used to + // identify a loop. + return success(); + }) + .Case([&](omp::CanonicalLoopOp op) { + return convertOmpCanonicalLoopOp(op, builder, moduleTranslation); + }) + .Case([&](omp::UnrollHeuristicOp op) { + // FIXME: Handling omp.unroll_heuristic as an executable requires + // that the generator (e.g. 
omp.canonical_loop) has been seen first. + // For construct that require all codegen to occur inside a callback + // (e.g. OpenMPIRBilder::createParallel), all codegen of that + // contained region including their transformations must occur at + // the omp.canonical_loop. + return applyUnrollHeuristic(op, builder, moduleTranslation); + }) .Default([&](Operation *inst) { return inst->emitError() << "not yet implemented: " << inst->getName(); diff --git a/mlir/python/mlir/dialects/TransformTuneExtensionOps.td b/mlir/python/mlir/dialects/TransformTuneExtensionOps.td index ff3047592ab12..c622c31e2c736 100644 --- a/mlir/python/mlir/dialects/TransformTuneExtensionOps.td +++ b/mlir/python/mlir/dialects/TransformTuneExtensionOps.td @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// -#ifndef PYTHON_BINDINGS_TRANSFORM_DEBUG_EXTENSION_OPS -#define PYTHON_BINDINGS_TRANSFORM_DEBUG_EXTENSION_OPS +#ifndef PYTHON_BINDINGS_TRANSFORM_TUNE_EXTENSION_OPS +#define PYTHON_BINDINGS_TRANSFORM_TUNE_EXTENSION_OPS include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td" -#endif // PYTHON_BINDINGS_TRANSFORM_DEBUG_EXTENSION_OPS +#endif // PYTHON_BINDINGS_TRANSFORM_TUNE_EXTENSION_OPS diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir index 99ab0e1dc4eef..27fd74e12d36e 100644 --- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir +++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir @@ -967,6 +967,22 @@ func.func @reduction_minui(%v : vector<3xi32>, %s: i32) -> i32 { // ----- +module attributes { spirv.target_env = #spirv.target_env<#spirv.vce, #spirv.resource_limits<>> } { + +// CHECK-LABEL: func @reduction_bf16_addf_mulf +// CHECK-SAME: (%[[ARG0:.+]]: vector<4xbf16>, %[[ARG1:.+]]: vector<4xbf16>) +// CHECK: %[[DOT:.+]] = spirv.Dot %[[ARG0]], %[[ARG1]] : vector<4xbf16> -> bf16 +// CHECK: return %[[DOT]] : bf16 +func.func 
@reduction_bf16_addf_mulf(%arg0: vector<4xbf16>, %arg1: vector<4xbf16>) -> bf16 { + %mul = arith.mulf %arg0, %arg1 : vector<4xbf16> + %red = vector.reduction , %mul : vector<4xbf16> into bf16 + return %red : bf16 +} + +} // end module + +// ----- + // CHECK-LABEL: @shape_cast_same_type // CHECK-SAME: (%[[ARG0:.*]]: vector<2xf32>) // CHECK: return %[[ARG0]] diff --git a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir new file mode 100644 index 0000000000000..bdbb12bbe0cbb --- /dev/null +++ b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir @@ -0,0 +1,244 @@ +// RUN: mlir-opt --convert-xevm-to-llvm --split-input-file %s | FileCheck %s + +// Same below, but using the `ConvertToLLVMPatternInterface` entry point +// and the generic `convert-to-llvm` pass. +// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s + +// CHECK-LABEL: llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt( +// CHECK-SAME: !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, +// CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} +// CHECK: llvm.func @blockload2d(%[[ARG0:.*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32) +llvm.func @blockload2d(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi16> { + // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32> + // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32> + // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i16 : (i32) -> 
!llvm.ptr + // CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt( + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]]) + // CHECK-SAME: {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, + // CHECK-SAME: linkage = #llvm.linkage, no_unwind, sym_name = + // CHECK-SAME: "_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt", visibility_ = 0 : i64, + // CHECK-SAME: will_return} : + // CHECK-SAME: (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, + // CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) -> () + // CHECK: %[[VAR7:.*]] = llvm.load %[[VAR6]] : !llvm.ptr -> vector<8xi16> + %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y + <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, + pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16> + llvm.return %loaded_a : vector<8xi16> +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt( +llvm.func @blockload2d_cache_control(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi16> { + // CHECK: xevm.DecorationCacheControl = + // CHECK-SAME: 6442 : i32, 0 : i32, 1 : i32, 0 : i32 + // CHECK-SAME: 6442 : i32, 1 : i32, 1 : i32, 0 : i32 + %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y + <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, + pack_register=false, cache_control=#xevm.load_cache_control}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16> + llvm.return %loaded_a : vector<8xi16> +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt( +// CHECK-SAME: !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, 
i32, i32, vector<2xi32>, +// CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} +// CHECK: llvm.func @blockload2d_v_blocks(%[[ARG0:.*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32) +llvm.func @blockload2d_v_blocks(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<16xi16> { + // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32> + // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32> + // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i16 : (i32) -> !llvm.ptr + // CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]]) + // CHECK-SAME: {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, + // CHECK-SAME: linkage = #llvm.linkage, no_unwind, sym_name = + // CHECK-SAME: "_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt", visibility_ = 0 : i64, + // CHECK-SAME: will_return} + // CHECK-SAME: (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, + // CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) -> () + // CHECK: %[[VAR7:.*]] = llvm.load %[[VAR6]] : !llvm.ptr -> vector<16xi16> + %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y + <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=2 : i32, transpose=false, + pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return %loaded_a : vector<16xi16> +} + +// 
----- +// CHECK-LABEL: llvm.func spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_16r16x1cPU3AS1viiiDv2_iPj( +// CHECK-SAME: !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, +// CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} +// CHECK: llvm.func @blockload2d_pack_register(%[[ARG0:.*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32) +llvm.func @blockload2d_pack_register(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi32> { + // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32> + // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32> + // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr + // CHECK: llvm.call spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_16r16x1cPU3AS1viiiDv2_iPj( + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]]) + // CHECK-SAME: {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, + // CHECK-SAME: linkage = #llvm.linkage, no_unwind, sym_name = + // CHECK-SAME: "_Z52intel_sub_group_2d_block_read_transform_16b_16r16x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, + // CHECK-SAME: will_return} : + // CHECK-SAME: (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, + // CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) -> () + // CHECK: %[[VAR7:.*]] = llvm.load %[[VAR6]] : !llvm.ptr -> vector<8xi32> + %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y + <{elem_size_in_bits=16 
: i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, + pack_register=true}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + llvm.return %loaded_a : vector<8xi32> +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_16r8x1cPU3AS1viiiDv2_iPj( +// CHECK-SAME: !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, +// CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} +// CHECK: llvm.func @blockload2d_transpose(%[[ARG0:.*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32) +llvm.func @blockload2d_transpose(%a: !llvm.ptr<1>, %base_width_a: i32, %base_height_a: i32, %base_pitch_a: i32, %x: i32, %y: i32) -> vector<8xi32> { + // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32> + // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32> + // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr + // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_16r8x1cPU3AS1viiiDv2_iPj( + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]]) + // CHECK-SAME: {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, + // CHECK-SAME: linkage = #llvm.linkage, no_unwind, sym_name = + // CHECK-SAME: "_Z51intel_sub_group_2d_block_read_transpose_32b_16r8x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, + // CHECK-SAME: will_return} + // CHECK-SAME: (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, + // CHECK-SAME: !llvm.ptr {llvm.nonnull, 
llvm.writeonly}) -> () + // CHECK: %[[VAR7:.*]] = llvm.load %[[VAR6]] : !llvm.ptr -> vector<8xi32> + %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y + <{elem_size_in_bits=32 : i32, tile_width=8 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=true, + pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + llvm.return %loaded_a : vector<8xi32> +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj( +// CHECK-SAME: !llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, +// CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.readonly}) attributes {no_unwind, will_return} +// CHECK: llvm.func @blockstore2d(%[[ARG0:.*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32, %[[ARG6:.*]]: vector<8xi32>) { +llvm.func @blockstore2d(%c: !llvm.ptr<1>, %base_width_c: i32, %base_height_c: i32, %base_pitch_c: i32, %x: i32, %y: i32, %c_result_casted: vector<8xi32>) { + // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32> + // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32> + // CHECK: %[[VAR5:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAR6:.*]] = llvm.alloca %[[VAR5]] x i32 : (i32) -> !llvm.ptr + // CHECK: llvm.store %[[ARG6]], %[[VAR6]] : vector<8xi32>, !llvm.ptr + // CHECK: llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj( + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]], %[[VAR6]]) + // CHECK-SAME: {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, + // CHECK-SAME: linkage = #llvm.linkage, no_unwind, 
sym_name = + // CHECK-SAME: "_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, + // CHECK-SAME: will_return} + // CHECK-SAME: : (!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, + // CHECK-SAME: !llvm.ptr {llvm.nonnull, llvm.readonly}) -> () + xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted + <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> + : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>) + llvm.return +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj( +llvm.func @blockstore2d_cache_control(%c: !llvm.ptr<1>, %base_width_c: i32, %base_height_c: i32, %base_pitch_c: i32, %x: i32, %y: i32, %c_result_casted: vector<8xi32>) { + // CHECK: xevm.DecorationCacheControl = + // CHECK-SAME: 6443 : i32, 0 : i32, 2 : i32, 0 : i32 + // CHECK-SAME: 6443 : i32, 1 : i32, 2 : i32, 0 : i32 + xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted + <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, cache_control = #xevm.store_cache_control}> + : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>) + llvm.return +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i( +// CHECK-SAME: !llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes +// CHECK-SAME: {memory_effects = #llvm.memory_effects, no_unwind} +// CHECK: llvm.func @blockprefetch2d(%[[ARG0:.*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32) { +llvm.func @blockprefetch2d(%ptr: !llvm.ptr<1>, %base_width: i32, %base_height: i32, %base_pitch: i32, %x: i32, %y: i32) { + // CHECK: %[[VAR0:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAR2:.*]] 
= llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR3:.*]] = llvm.insertelement %[[ARG4]], %[[VAR0]][%[[VAR1]] : i32] : vector<2xi32> + // CHECK: %[[VAR4:.*]] = llvm.insertelement %[[ARG5]], %[[VAR3]][%[[VAR2]] : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i( + // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[VAR4]]) + // CHECK-SAME: {function_type = !llvm.func, i32, i32, i32, vector<2xi32>)>, linkage = #llvm.linkage, + // CHECK-SAME: memory_effects = #llvm.memory_effects, no_unwind, + // CHECK-SAME: sym_name = "_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i", visibility_ = 0 : i64 + xevm.blockprefetch2d %ptr, %base_width, %base_height, %base_pitch, %x, %y + <{elem_size_in_bits=8 : i32, tile_width=32 : i32, tile_height=8 : i32, v_blocks=1 : i32, + cache_control=#xevm.load_cache_control}> + : (!llvm.ptr<1>, i32, i32, i32, i32, i32) + llvm.return +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f( +// CHECK-SAME: vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes +// CHECK-SAME: {convergent, memory_effects = #llvm.memory_effects, no_unwind, will_return} +// CHECK: llvm.func @mma(%[[ARG0:.*]]: vector<8xf32>, %[[ARG1:.*]]: vector<8xi16>, %[[ARG2:.*]]: vector<8xi32>) -> vector<8xf32> { +llvm.func @mma(%loaded_c_casted: vector<8xf32>, %loaded_a: vector<8xi16>, %loaded_b_casted: vector<8xi32>) -> vector<8xf32> { + // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f( + // CHECK-SAME: %[[ARG1]], %[[ARG2]], %[[ARG0]]) {convergent, function_type = + // CHECK-SAME: !llvm.func (vector<8xi16>, vector<8xi32>, vector<8xf32>)>, linkage = #llvm.linkage, + // CHECK-SAME: memory_effects = #llvm.memory_effects, no_unwind, + // CHECK-SAME: sym_name = "_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f", visibility_ = 0 : i64, will_return} + 
// CHECK-SAME: : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> + %c_result = xevm.mma %loaded_a, %loaded_b_casted, %loaded_c_casted + { shape=, types= } + : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> + llvm.return %c_result : vector<8xf32> +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z22atomic_work_item_fenceiii(i32, i32, i32) attributes {no_unwind} +llvm.func @memfence() { + // CHECK: %[[VAR0:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[VAR1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[VAR2:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22atomic_work_item_fenceiii(%[[VAR2]], %[[VAR0]], %[[VAR1]]) + // CHECK-SAME: {function_type = !llvm.func, linkage = #llvm.linkage, no_unwind, + // CHECK-SAME: sym_name = "_Z22atomic_work_item_fenceiii", visibility_ = 0 : i64} : (i32, i32, i32) -> () + xevm.memfence <{addrspace=#xevm.addr_space, scope=#xevm.mem_scope}> + llvm.return +} + +// ----- +// CHECK-LABEL: llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64) attributes +// CHECK-SAME: {memory_effects = #llvm.memory_effects, no_unwind} +// CHECK: llvm.func @prefetch(%[[ARG0:.*]]: !llvm.ptr<1>) { +llvm.func @prefetch(%ptr: !llvm.ptr<1>) { + // CHECK: %[[VAR0:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: llvm.call spir_funccc @_Z8prefetchPU3AS1Kcm(%[[ARG0]], %[[VAR0]]) + // CHECK-SAME: {function_type = !llvm.func, i64)>, linkage = #llvm.linkage, + // CHECK-SAME: memory_effects = #llvm.memory_effects, + // CHECK-SAME: no_unwind, sym_name = "_Z8prefetchPU3AS1Kcm", visibility_ = 0 : i64 + xevm.prefetch %ptr <{cache_control = #xevm.load_cache_control}> : (!llvm.ptr<1>) + llvm.return +} + diff --git a/mlir/test/Dialect/LLVMIR/attach-targets.mlir b/mlir/test/Dialect/LLVMIR/attach-targets.mlir index 83733db400798..d1112f7411aae 100644 --- a/mlir/test/Dialect/LLVMIR/attach-targets.mlir +++ b/mlir/test/Dialect/LLVMIR/attach-targets.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt 
%s --nvvm-attach-target='module=nvvm.* O=3 chip=sm_90' --rocdl-attach-target='module=rocdl.* O=3 chip=gfx90a' | FileCheck %s -// RUN: mlir-opt %s --nvvm-attach-target='module=options.* O=1 chip=sm_70 fast=true ftz=true' --rocdl-attach-target='module=options.* l=file1.bc,file2.bc wave64=false finite-only=true' | FileCheck %s --check-prefix=CHECK_OPTS +// RUN: mlir-opt %s --nvvm-attach-target='module=nvvm.* O=3 chip=sm_90' --rocdl-attach-target='module=rocdl.* O=3 chip=gfx90a' --xevm-attach-target='module=xevm.* O=3 chip=pvc' | FileCheck %s +// RUN: mlir-opt %s --nvvm-attach-target='module=options.* O=1 chip=sm_70 fast=true ftz=true' --rocdl-attach-target='module=options.* l=file1.bc,file2.bc wave64=false finite-only=true' --xevm-attach-target='module=options.* O=1 chip=pvc' | FileCheck %s --check-prefix=CHECK_OPTS module attributes {gpu.container_module} { // Verify the target is appended. @@ -18,12 +18,21 @@ gpu.module @nvvm_module_3 [#nvvm.target] { // CHECK: @rocdl_module [#rocdl.target] { gpu.module @rocdl_module { } +// Verify that other targets are not added as they fail to match the regex, but XeVM does get appended. +// CHECK: @xevm_module [#xevm.target] { +gpu.module @xevm_module { +} // Check the options were added. -// CHECK_OPTS: @options_module_1 [#nvvm.target, #rocdl.target] { +// CHECK_OPTS: @options_module_1 [#nvvm.target, +// CHECK_OPTS-SAME: #rocdl.target, +// CHECK_OPTS-SAME: #xevm.target] { gpu.module @options_module_1 { } // Check the options were added and that the first target was preserved. 
-// CHECK_OPTS: @options_module_2 [#nvvm.target, #nvvm.target, #rocdl.target] { +// CHECK_OPTS: @options_module_2 [#nvvm.target, +// CHECK_OPTS-SAME: #nvvm.target, +// CHECK_OPTS-SAME: #rocdl.target, +// CHECK_OPTS-SAME: #xevm.target] { gpu.module @options_module_2 [#nvvm.target] { } } diff --git a/mlir/test/Dialect/LLVMIR/cse-nvvm.mlir b/mlir/test/Dialect/LLVMIR/cse-nvvm.mlir new file mode 100644 index 0000000000000..8d24c3846f178 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/cse-nvvm.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-opt %s -cse -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @nvvm_special_regs_clock +llvm.func @nvvm_special_regs_clock() -> !llvm.struct<(i32, i32)> { + %0 = llvm.mlir.zero: !llvm.struct<(i32, i32)> + // CHECK: {{.*}} = nvvm.read.ptx.sreg.clock + %1 = nvvm.read.ptx.sreg.clock : i32 + // CHECK: {{.*}} = nvvm.read.ptx.sreg.clock + %2 = nvvm.read.ptx.sreg.clock : i32 + %4 = llvm.insertvalue %1, %0[0]: !llvm.struct<(i32, i32)> + %5 = llvm.insertvalue %2, %4[1]: !llvm.struct<(i32, i32)> + llvm.return %5: !llvm.struct<(i32, i32)> +} + +// CHECK-LABEL: @nvvm_special_regs_clock64 +llvm.func @nvvm_special_regs_clock64() -> !llvm.struct<(i64, i64)> { + %0 = llvm.mlir.zero: !llvm.struct<(i64, i64)> + // CHECK: {{.*}} = nvvm.read.ptx.sreg.clock64 + %1 = nvvm.read.ptx.sreg.clock64 : i64 + // CHECK: {{.*}} = nvvm.read.ptx.sreg.clock64 + %2 = nvvm.read.ptx.sreg.clock64 : i64 + %4 = llvm.insertvalue %1, %0[0]: !llvm.struct<(i64, i64)> + %5 = llvm.insertvalue %2, %4[1]: !llvm.struct<(i64, i64)> + llvm.return %5: !llvm.struct<(i64, i64)> +} + +// CHECK-LABEL: @nvvm_special_regs_globaltimer +llvm.func @nvvm_special_regs_globaltimer() -> !llvm.struct<(i64, i64)> { + %0 = llvm.mlir.zero: !llvm.struct<(i64, i64)> + // CHECK: {{.*}} = nvvm.read.ptx.sreg.globaltimer + %1 = nvvm.read.ptx.sreg.globaltimer : i64 + // CHECK: {{.*}} = nvvm.read.ptx.sreg.globaltimer + %2 = nvvm.read.ptx.sreg.globaltimer : i64 + %4 = llvm.insertvalue %1, %0[0]: 
!llvm.struct<(i64, i64)> + %5 = llvm.insertvalue %2, %4[1]: !llvm.struct<(i64, i64)> + llvm.return %5: !llvm.struct<(i64, i64)> +} diff --git a/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir b/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir index d58c27598f2b8..3adafc15c79f6 100644 --- a/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir @@ -321,6 +321,15 @@ func.func @dot(%arg0: vector<4xf32>, %arg1: vector<4xf32>) -> f32 { // ----- +// CHECK-LABEL: @dot_bf16 +func.func @dot_bf16(%arg0: vector<4xbf16>, %arg1: vector<4xbf16>) -> bf16 { + // CHECK: spirv.Dot %{{.+}}, %{{.+}} : vector<4xbf16> -> bf16 + %0 = spirv.Dot %arg0, %arg1 : vector<4xbf16> -> bf16 + return %0 : bf16 +} + +// ----- + // expected-note @+1 {{prior use here}} func.func @dot(%arg0: vector<4xf32>, %arg1: vector<3xf32>) -> f32 { // expected-error @+1 {{use of value '%arg1' expects different type than prior uses}} @@ -339,7 +348,7 @@ func.func @dot(%arg0: vector<4xf32>, %arg1: vector<4xf32>) -> f16 { // ----- func.func @dot(%arg0: vector<4xi32>, %arg1: vector<4xi32>) -> i32 { - // expected-error @+1 {{'spirv.Dot' op operand #0 must be vector of 16/32/64-bit float values of length 2/3/4/8/16}} + // expected-error @+1 {{'spirv.Dot' op operand #0 must be vector of 16/32/64-bit float or BFloat16 values of length 2/3/4/8/16}} %0 = spirv.Dot %arg0, %arg1 : vector<4xi32> -> i32 return %0 : i32 } diff --git a/mlir/test/Dialect/SPIRV/IR/availability.mlir b/mlir/test/Dialect/SPIRV/IR/availability.mlir index 64ba8e3fc249e..9c8665b1e4bbe 100644 --- a/mlir/test/Dialect/SPIRV/IR/availability.mlir +++ b/mlir/test/Dialect/SPIRV/IR/availability.mlir @@ -234,6 +234,20 @@ func.func @udot_acc_sat_vector_4xi16_i64(%a: vector<4xi16>, %acc: i64) -> i64 { return %r: i64 } +//===----------------------------------------------------------------------===// +// Dot Product op with bfloat16 +//===----------------------------------------------------------------------===// + +// 
CHECK-LABEL: dot_vector_4xbf16_bf16 +func.func @dot_vector_4xbf16_bf16(%a: vector<4xbf16>, %b: vector<4xbf16>) -> bf16 { + // CHECK: min version: v1.0 + // CHECK: max version: v1.6 + // CHECK: extensions: [ [SPV_KHR_bfloat16] ] + // CHECK: capabilities: [ [BFloat16DotProductKHR] ] + %r = spirv.Dot %a, %a: vector<4xbf16> -> bf16 + return %r: bf16 +} + //===----------------------------------------------------------------------===// // Primitive ops //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir index 900ad99bb4a4c..b826cdca134e6 100644 --- a/mlir/test/Dialect/Vector/vector-sink.mlir +++ b/mlir/test/Dialect/Vector/vector-sink.mlir @@ -514,6 +514,18 @@ func.func @negative_extract_vec_fma(%arg0: vector<4xf32>, %arg1: vector<4xf32>, return %1 : f32 } +// CHECK-LABEL: @negative_extract_dynamic_pos +func.func @negative_extract_dynamic_pos(%arg0: vector<4xf32>, %arg1 : vector<4xf32>, %idx : vector<4xindex>) -> f32 { + // CHECK-NOT: vector.extract + // CHECK: arith.addf %{{.*}}, %{{.*}} : vector<4xf32> + // CHECK: vector.extract + // CHECK: vector.extract + %0 = arith.addf %arg0, %arg1 : vector<4xf32> + %1 = vector.extract %idx[0] : index from vector<4xindex> + %2 = vector.extract %0[%1] : f32 from vector<4xf32> + return %2 : f32 +} + //----------------------------------------------------------------------------- // [Pattern: ExtractOpFromLoad] //----------------------------------------------------------------------------- diff --git a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir index 63f06624ef897..72dc899f4f0a6 100644 --- a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir +++ b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir @@ -219,3 +219,19 @@ func.func @avx_dot(%a: vector<8xf32>, %b: vector<8xf32>) -> (vector<8xf32>) %0 = x86vector.avx.intr.dot %a, %b : vector<8xf32> return 
%0 : vector<8xf32> } + +// CHECK-LABEL: func @avx_dot_i8_128 +func.func @avx_dot_i8_128(%w: vector<4xi32>, %a: vector<16xi8>, + %b: vector<16xi8>) -> vector<4xi32> { + // CHECK: llvm.call_intrinsic "llvm.x86.avx2.vpdpbssd.128" + %0 = x86vector.avx.dot.i8 %w, %a, %b : vector<16xi8> -> vector<4xi32> + return %0 : vector<4xi32> +} + +// CHECK-LABEL: func @avx_dot_i8_256 +func.func @avx_dot_i8_256(%w: vector<8xi32>, %a: vector<32xi8>, + %b: vector<32xi8>) -> vector<8xi32> { + // CHECK: llvm.call_intrinsic "llvm.x86.avx2.vpdpbssd.256" + %0 = x86vector.avx.dot.i8 %w, %a, %b : vector<32xi8> -> vector<8xi32> + return %0 : vector<8xi32> +} diff --git a/mlir/test/Dialect/X86Vector/roundtrip.mlir b/mlir/test/Dialect/X86Vector/roundtrip.mlir index 7dcab3eb4dcb8..959177b27c7ea 100644 --- a/mlir/test/Dialect/X86Vector/roundtrip.mlir +++ b/mlir/test/Dialect/X86Vector/roundtrip.mlir @@ -229,3 +229,19 @@ func.func @avx_dot(%a: vector<8xf32>, %b: vector<8xf32>) -> (vector<8xf32>) %0 = x86vector.avx.intr.dot %a, %b : vector<8xf32> return %0 : vector<8xf32> } + +// CHECK-LABEL: func @avx_dot_i8_128 +func.func @avx_dot_i8_128(%w: vector<4xi32>, %a: vector<16xi8>, + %b: vector<16xi8>) -> vector<4xi32> { + // CHECK: x86vector.avx.dot.i8 {{.*}} : vector<16xi8> -> vector<4xi32> + %0 = x86vector.avx.dot.i8 %w, %a, %b : vector<16xi8> -> vector<4xi32> + return %0 : vector<4xi32> +} + +// CHECK-LABEL: func @avx_dot_i8_256 +func.func @avx_dot_i8_256(%w: vector<8xi32>, %a: vector<32xi8>, + %b: vector<32xi8>) -> vector<8xi32> { + // CHECK: x86vector.avx.dot.i8 {{.*}} : vector<32xi8> -> vector<8xi32> + %0 = x86vector.avx.dot.i8 %w, %a, %b : vector<32xi8> -> vector<8xi32> + return %0 : vector<8xi32> +} diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir b/mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir new file mode 100644 index 0000000000000..d889ef4f5700c --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir @@ -0,0 +1,121 @@ +// RUN: 
mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @_QFEi_private_i32 : i32 loc(#loc1) + omp.declare_reduction @add_reduction_i32 : i32 init { + ^bb0(%arg0: i32 loc("test.f90":8:7)): + %0 = llvm.mlir.constant(0 : i32) : i32 loc(#loc2) + omp.yield(%0 : i32) loc(#loc2) + } combiner { + ^bb0(%arg0: i32 loc("test.f90":8:7), %arg1: i32 loc("test.f90":8:7)): + %0 = llvm.add %arg0, %arg1 : i32 loc(#loc2) + omp.yield(%0 : i32) loc(#loc2) + } loc(#loc2) + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i64) : i64 loc(#loc4) + %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr<5> loc(#loc4) + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr loc(#loc4) + %3 = llvm.mlir.constant(1 : i64) : i64 loc(#loc1) + %4 = llvm.alloca %3 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> loc(#loc1) + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr loc(#loc1) + %6 = llvm.mlir.constant(8191 : index) : i64 loc(#loc5) + %7 = llvm.mlir.constant(0 : index) : i64 loc(#loc5) + %8 = llvm.mlir.constant(1 : index) : i64 loc(#loc5) + %9 = llvm.mlir.constant(0 : i32) : i32 loc(#loc5) + %10 = llvm.mlir.constant(8192 : index) : i64 loc(#loc5) + %11 = llvm.mlir.addressof @_QFEarr : !llvm.ptr<1> loc(#loc6) + %12 = llvm.addrspacecast %11 : !llvm.ptr<1> to !llvm.ptr loc(#loc6) + llvm.store %9, %2 : i32, !llvm.ptr loc(#loc7) + %15 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"} loc(#loc4) + %16 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc7) + %17 = omp.map.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) extent(%10 : i64) stride(%8 : i64) start_idx(%8 : i64) loc(#loc7) + %18 = omp.map.info 
var_ptr(%12 : !llvm.ptr, !llvm.array<8192 x i32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%17) -> !llvm.ptr {name = "arr"} loc(#loc7) + omp.target map_entries(%15 -> %arg0, %16 -> %arg1, %18 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %19 = llvm.mlir.constant(8192 : i32) : i32 loc(#loc5) + %20 = llvm.mlir.constant(1 : i32) : i32 loc(#loc5) + %21 = llvm.mlir.constant(8192 : index) : i64 loc(#loc6) + omp.teams reduction(@add_reduction_i32 %arg0 -> %arg3 : !llvm.ptr) { + omp.parallel private(@_QFEi_private_i32 %arg1 -> %arg4 : !llvm.ptr) { + omp.distribute { + omp.wsloop reduction(@add_reduction_i32 %arg3 -> %arg5 : !llvm.ptr) { + omp.loop_nest (%arg6) : i32 = (%20) to (%19) inclusive step (%20) { + llvm.store %arg6, %arg4 : i32, !llvm.ptr loc(#loc2) + %22 = llvm.load %arg5 : !llvm.ptr -> i32 loc(#loc8) + %23 = llvm.load %arg4 : !llvm.ptr -> i32 loc(#loc8) + %34 = llvm.add %22, %23 : i32 loc(#loc8) + llvm.store %34, %arg5 : i32, !llvm.ptr loc(#loc8) + omp.yield loc(#loc2) + } loc(#loc2) + } {omp.composite} loc(#loc2) + } {omp.composite} loc(#loc2) + omp.terminator loc(#loc2) + } {omp.composite} loc(#loc2) + omp.terminator loc(#loc2) + } loc(#loc2) + omp.terminator loc(#loc2) + } loc(#loc13) + llvm.return loc(#loc9) + } loc(#loc12) + llvm.mlir.global internal @_QFEarr() {addr_space = 1 : i32} : !llvm.array<8192 x i32> { + %0 = llvm.mlir.zero : !llvm.array<8192 x i32> loc(#loc6) + llvm.return %0 : !llvm.array<8192 x i32> loc(#loc6) + } loc(#loc6) +} loc(#loc) + +#loc = loc("test.f90":4:18) +#loc1 = loc("test.f90":4:18) +#loc2 = loc("test.f90":8:7) +#loc3 = loc("test.f90":1:7) +#loc4 = loc("test.f90":3:18) +#loc5 = loc(unknown) +#loc6 = loc("test.f90":5:18) +#loc7 = loc("test.f90":6:7) +#loc8 = loc("test.f90":10:7) +#loc9 = loc("test.f90":16:7) + +#di_file = #llvm.di_file<"target7.f90" in ""> +#di_null_type = #llvm.di_null_type +#di_compile_unit = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang", + 
isOptimized = false, emissionKind = LineTablesOnly> +#di_subroutine_type = #llvm.di_subroutine_type< + callingConvention = DW_CC_program, types = #di_null_type> +#di_subprogram = #llvm.di_subprogram, + compileUnit = #di_compile_unit, scope = #di_file, name = "main", + file = #di_file, subprogramFlags = "Definition|MainSubprogram", + type = #di_subroutine_type> +#di_subprogram1 = #llvm.di_subprogram + + +#loc12 = loc(fused<#di_subprogram>[#loc3]) +#loc13 = loc(fused<#di_subprogram1>[#loc2]) + +// CHECK-DAG: define internal void @_omp_reduction_shuffle_and_reduce_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_inter_warp_copy_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @"__omp_offloading_{{.*}}__QQmain_l8_omp$reduction$reduction_func.1" +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_shuffle_and_reduce_func.2 +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_inter_warp_copy_func.3 +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_list_to_global_copy_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_list_to_global_reduce_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_global_to_list_copy_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_global_to_list_reduce_func +// CHECK-NOT: !dbg +// CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir b/mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir new file mode 100644 index 0000000000000..9abef003d6183 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-canonical_loop.mlir @@ -0,0 +1,175 @@ +// Test lowering of standalone omp.canonical_loop +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: define void @anon_loop( +// CHECK-SAME: ptr %[[ptr:.+]], +// CHECK-SAME: i32 
%[[tc:.+]]) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header: +// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ 0, %omp_omp.loop.preheader ], [ %omp_omp.loop.next, %omp_omp.loop.inc ] +// CHECK-NEXT: br label %omp_omp.loop.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond: +// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %omp_omp.loop.iv, %[[tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: +// CHECK-NEXT: br label %omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: +// CHECK-NEXT: store float 4.200000e+01, ptr %[[ptr]], align 4 +// CHECK-NEXT: br label %omp.region.cont +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: +// CHECK-NEXT: br label %omp_omp.loop.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc: +// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %omp_omp.loop.iv, 1 +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit: +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: +// CHECK-NEXT: ret void +// CHECK-NEXT: } +llvm.func @anon_loop(%ptr: !llvm.ptr, %tc : i32) -> () { + omp.canonical_loop %iv : i32 in range(%tc) { + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + llvm.return +} + + + +// CHECK-LABEL: define void @trivial_loop( +// CHECK-SAME: ptr %[[ptr:.+]], +// CHECK-SAME: i32 %[[tc:.+]]) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header: +// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ 0, %omp_omp.loop.preheader ], [ %omp_omp.loop.next, %omp_omp.loop.inc ] +// CHECK-NEXT: br label 
%omp_omp.loop.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond: +// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %omp_omp.loop.iv, %[[tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: +// CHECK-NEXT: br label %omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: +// CHECK-NEXT: store float 4.200000e+01, ptr %[[ptr]], align 4 +// CHECK-NEXT: br label %omp.region.cont +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: +// CHECK-NEXT: br label %omp_omp.loop.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc: +// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %omp_omp.loop.iv, 1 +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit: +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: +// CHECK-NEXT: ret void +// CHECK-NEXT: } +llvm.func @trivial_loop(%ptr: !llvm.ptr, %tc : i32) -> () { + %cli = omp.new_cli + omp.canonical_loop(%cli) %iv : i32 in range(%tc) { + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + llvm.return +} + + +// CHECK-LABEL: define void @nested_loop( +// CHECK-SAME: ptr %[[ptr:.+]], i32 %[[outer_tc:.+]], i32 %[[inner_tc:.+]]) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header: +// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ 0, %omp_omp.loop.preheader ], [ %omp_omp.loop.next, %omp_omp.loop.inc ] +// CHECK-NEXT: br label %omp_omp.loop.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond: +// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %omp_omp.loop.iv, %[[outer_tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: +// CHECK-NEXT: br label 
%omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: +// CHECK-NEXT: br label %omp_omp.loop.preheader1 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader1: +// CHECK-NEXT: br label %omp_omp.loop.header2 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header2: +// CHECK-NEXT: %omp_omp.loop.iv8 = phi i32 [ 0, %omp_omp.loop.preheader1 ], [ %omp_omp.loop.next10, %omp_omp.loop.inc5 ] +// CHECK-NEXT: br label %omp_omp.loop.cond3 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond3: +// CHECK-NEXT: %omp_omp.loop.cmp9 = icmp ult i32 %omp_omp.loop.iv8, %[[inner_tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp9, label %omp_omp.loop.body4, label %omp_omp.loop.exit6 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body4: +// CHECK-NEXT: br label %omp.loop.region12 +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region12: +// CHECK-NEXT: store float 4.200000e+01, ptr %[[ptr]], align 4 +// CHECK-NEXT: br label %omp.region.cont11 +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont11: +// CHECK-NEXT: br label %omp_omp.loop.inc5 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc5: +// CHECK-NEXT: %omp_omp.loop.next10 = add nuw i32 %omp_omp.loop.iv8, 1 +// CHECK-NEXT: br label %omp_omp.loop.header2 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit6: +// CHECK-NEXT: br label %omp_omp.loop.after7 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after7: +// CHECK-NEXT: br label %omp.region.cont +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: +// CHECK-NEXT: br label %omp_omp.loop.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc: +// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %omp_omp.loop.iv, 1 +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit: +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: +// CHECK-NEXT: ret void +// CHECK-NEXT: } +llvm.func @nested_loop(%ptr: !llvm.ptr, %outer_tc : i32, %inner_tc : i32) -> () { + %outer_cli = omp.new_cli + %inner_cli = omp.new_cli + 
omp.canonical_loop(%outer_cli) %outer_iv : i32 in range(%outer_tc) { + omp.canonical_loop(%inner_cli) %inner_iv : i32 in range(%inner_tc) { + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir new file mode 100644 index 0000000000000..0f0448e15f983 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic01.mlir @@ -0,0 +1,56 @@ +// Test lowering of the omp.unroll_heuristic +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + + +// CHECK-LABEL: define void @unroll_heuristic_trivial_loop( +// CHECK-SAME: ptr %[[ptr:.+]], i32 %[[tc:.+]]) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header: +// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ 0, %omp_omp.loop.preheader ], [ %omp_omp.loop.next, %omp_omp.loop.inc ] +// CHECK-NEXT: br label %omp_omp.loop.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond: +// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %omp_omp.loop.iv, %[[tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: +// CHECK-NEXT: br label %omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: +// CHECK-NEXT: store float 4.200000e+01, ptr %[[ptr]], align 4 +// CHECK-NEXT: br label %omp.region.cont +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: +// CHECK-NEXT: br label %omp_omp.loop.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc: +// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %omp_omp.loop.iv, 1 +// CHECK-NEXT: br label %omp_omp.loop.header, !llvm.loop ![[$MD1:[0-9]+]] +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit: +// CHECK-NEXT: br label 
%omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: +// CHECK-NEXT: ret void +// CHECK-NEXT: } +llvm.func @unroll_heuristic_trivial_loop(%ptr: !llvm.ptr, %tc: i32) -> () { + %literal_cli = omp.new_cli + omp.canonical_loop(%literal_cli) %iv : i32 in range(%tc) { + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.unroll_heuristic(%literal_cli) + llvm.return +} + + +// Start of metadata +// CHECK-LABEL: !llvm.module.flags + +// CHECK: ![[$MD1]] = distinct !{![[$MD1]], ![[$MD2:[0-9]+]]} +// CHECK: ![[$MD2]] = !{!"llvm.loop.unroll.enable"} diff --git a/mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir new file mode 100644 index 0000000000000..f82b4990e378e --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-unroll-heuristic02.mlir @@ -0,0 +1,93 @@ +// Test lowering of the omp.unroll_heuristic +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + + +// CHECK-LABEL: define void @unroll_heuristic_nested_loop( +// CHECK-SAME: ptr %[[ptr:.+]], i32 %[[outer_tc:.+]], i32 %[[inner_tc:.+]]) { +// CHECK-NEXT: br label %omp_omp.loop.preheader +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.preheader: +// CHECK-NEXT: br label %omp_omp.loop.header +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header: +// CHECK-NEXT: %omp_omp.loop.iv = phi i32 [ 0, %omp_omp.loop.preheader ], [ %omp_omp.loop.next, %omp_omp.loop.inc ] +// CHECK-NEXT: br label %omp_omp.loop.cond +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond: +// CHECK-NEXT: %omp_omp.loop.cmp = icmp ult i32 %omp_omp.loop.iv, %[[outer_tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp, label %omp_omp.loop.body, label %omp_omp.loop.exit +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body: +// CHECK-NEXT: br label %omp.loop.region +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region: +// CHECK-NEXT: br label %omp_omp.loop.preheader1 +// CHECK-EMPTY: +// CHECK-NEXT: 
omp_omp.loop.preheader1: +// CHECK-NEXT: br label %omp_omp.loop.header2 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.header2: +// CHECK-NEXT: %omp_omp.loop.iv8 = phi i32 [ 0, %omp_omp.loop.preheader1 ], [ %omp_omp.loop.next10, %omp_omp.loop.inc5 ] +// CHECK-NEXT: br label %omp_omp.loop.cond3 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.cond3: +// CHECK-NEXT: %omp_omp.loop.cmp9 = icmp ult i32 %omp_omp.loop.iv8, %[[inner_tc]] +// CHECK-NEXT: br i1 %omp_omp.loop.cmp9, label %omp_omp.loop.body4, label %omp_omp.loop.exit6 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.body4: +// CHECK-NEXT: br label %omp.loop.region12 +// CHECK-EMPTY: +// CHECK-NEXT: omp.loop.region12: +// CHECK-NEXT: store float 4.200000e+01, ptr %[[ptr]], align 4 +// CHECK-NEXT: br label %omp.region.cont11 +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont11: +// CHECK-NEXT: br label %omp_omp.loop.inc5 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc5: +// CHECK-NEXT: %omp_omp.loop.next10 = add nuw i32 %omp_omp.loop.iv8, 1 +// CHECK-NEXT: br label %omp_omp.loop.header2, !llvm.loop ![[$MD1:[0-9]+]] +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit6: +// CHECK-NEXT: br label %omp_omp.loop.after7 +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after7: +// CHECK-NEXT: br label %omp.region.cont +// CHECK-EMPTY: +// CHECK-NEXT: omp.region.cont: +// CHECK-NEXT: br label %omp_omp.loop.inc +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.inc: +// CHECK-NEXT: %omp_omp.loop.next = add nuw i32 %omp_omp.loop.iv, 1 +// CHECK-NEXT: br label %omp_omp.loop.header, !llvm.loop ![[$MD3:[0-9]+]] +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.exit: +// CHECK-NEXT: br label %omp_omp.loop.after +// CHECK-EMPTY: +// CHECK-NEXT: omp_omp.loop.after: +// CHECK-NEXT: ret void +// CHECK-NEXT: } +llvm.func @unroll_heuristic_nested_loop(%ptr: !llvm.ptr, %outer_tc: i32, %inner_tc: i32) -> () { + %outer_cli = omp.new_cli + %inner_cli = omp.new_cli + omp.canonical_loop(%outer_cli) %outer_iv : i32 in range(%outer_tc) { + 
omp.canonical_loop(%inner_cli) %inner_iv : i32 in range(%inner_tc) { + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + omp.unroll_heuristic(%outer_cli) + omp.unroll_heuristic(%inner_cli) + llvm.return +} + + +// Start of metadata +// CHECK-LABEL: !llvm.module.flags + +// CHECK: ![[$MD1]] = distinct !{![[$MD1]], ![[$MD2:[0-9]+]]} +// CHECK: ![[$MD2]] = !{!"llvm.loop.unroll.enable"} +// CHECK: ![[$MD3]] = distinct !{![[$MD3]], ![[$MD2]]} + diff --git a/mlir/test/Target/LLVMIR/x86vector.mlir b/mlir/test/Target/LLVMIR/x86vector.mlir index d11dc89bdc7c9..74ae2424964b1 100644 --- a/mlir/test/Target/LLVMIR/x86vector.mlir +++ b/mlir/test/Target/LLVMIR/x86vector.mlir @@ -234,3 +234,19 @@ func.func @LLVM_x86_avx_dp_ps_256( %0 = x86vector.avx.intr.dot %a, %b : vector<8xf32> return %0 : vector<8xf32> } + +// CHECK-LABEL: define <4 x i32> @LLVM_x86_avx2_vpdpbssd_128 +func.func @LLVM_x86_avx2_vpdpbssd_128(%w: vector<4xi32>, %a: vector<16xi8>, + %b: vector<16xi8>) -> vector<4xi32> { + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128( + %0 = x86vector.avx.dot.i8 %w, %a, %b : vector<16xi8> -> vector<4xi32> + return %0 : vector<4xi32> +} + +// CHECK-LABEL: define <8 x i32> @LLVM_x86_avx2_vpdpbssd_256 +func.func @LLVM_x86_avx2_vpdpbssd_256(%w: vector<8xi32>, %a: vector<32xi8>, + %b: vector<32xi8>) -> vector<8xi32> { + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256( + %0 = x86vector.avx.dot.i8 %w, %a, %b : vector<32xi8> -> vector<8xi32> + return %0 : vector<8xi32> +} diff --git a/mlir/test/Target/SPIRV/arithmetic-ops.mlir b/mlir/test/Target/SPIRV/arithmetic-ops.mlir index b1ea13c6854fd..b80e17f979daa 100644 --- a/mlir/test/Target/SPIRV/arithmetic-ops.mlir +++ b/mlir/test/Target/SPIRV/arithmetic-ops.mlir @@ -86,4 +86,9 @@ spirv.module Logical GLSL450 requires #spirv.vce { %0 = spirv.VectorTimesScalar %arg0, %arg1 : (vector<4xf32>, f32) -> vector<4xf32> spirv.Return } + spirv.func @dot_bf16(%arg0: 
vector<4xbf16>, %arg1: vector<4xbf16>) "None" { + // CHECK: spirv.Dot %{{.+}}, %{{.+}} : vector<4xbf16> -> bf16 + %0 = spirv.Dot %arg0, %arg1 : vector<4xbf16> -> bf16 + spirv.Return + } } diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt index 4ca5974ed5a49..418c884dc03b3 100644 --- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt @@ -29,6 +29,7 @@ set(LIBS MLIRTranslateLib MLIRVectorDialect MLIRVectorToLLVMPass + MLIRXeVMDialect ) add_mlir_library(MLIRGPUTestPasses diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td index 4d825e2f0a8cc..382da592d0079 100644 --- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td +++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td @@ -431,4 +431,11 @@ def SlashAttr: Test_Attr<"Slash">{ let hasCustomAssemblyFormat = 1; } +def TestCustomStorageCtorAttr : Test_Attr<"TestCustomStorageCtorAttr"> { + let mnemonic = "custom_storage_ctor_attr"; + let parameters = (ins "int":$value); + let assemblyFormat = "`<` $value `>`"; + let hasStorageCustomConstructor = 1; +} + #endif // TEST_ATTRDEFS diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp index 4f6655d0b2978..b31e90fc9ca91 100644 --- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp +++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp @@ -515,6 +515,18 @@ void SlashAttr::print(AsmPrinter &printer) const { printer << "<" << getLhs() << " / " << getRhs() << ">"; } +//===----------------------------------------------------------------------===// +// TestCustomStorageCtorAttr +//===----------------------------------------------------------------------===// + +test::detail::TestCustomStorageCtorAttrAttrStorage * +test::detail::TestCustomStorageCtorAttrAttrStorage::construct( + mlir::StorageUniquer::StorageAllocator &, std::tuple &&) { + // Note: this tests linker error ("undefined symbol"), the actual + // 
implementation is not important. + return nullptr; +} + //===----------------------------------------------------------------------===// // TestDialect //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index 03261f37c815d..ea20597231d58 100644 --- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -352,6 +352,13 @@ def TestTypeCustomString : Test_Type<"TestTypeCustomString"> { custom(ref($foo)) `>` }]; } +def TestCustomStorageCtor : Test_Type<"TestCustomStorageCtor"> { + let mnemonic = "custom_storage_ctor_type"; + let parameters = (ins "int":$value); + let assemblyFormat = "`<` $value `>`"; + let hasStorageCustomConstructor = 1; +} + def TestTypeOptionalString : Test_Type<"TestTypeOptionalString"> { let parameters = (ins StringRefParameter<"description", [{"default"}]>:$str); let mnemonic = "optional_type_string"; diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp index 2fc2f90ef6bc0..bea043f56fe21 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.cpp +++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp @@ -392,6 +392,14 @@ getCustomAssemblyFormatDynamicType(TestDialect *testDialect) { std::move(parser), std::move(printer)); } +test::detail::TestCustomStorageCtorTypeStorage * +test::detail::TestCustomStorageCtorTypeStorage::construct( + mlir::StorageUniquer::StorageAllocator &, std::tuple &&) { + // Note: this tests linker error ("undefined symbol"), the actual + // implementation is not important. 
+ return nullptr; +} + //===----------------------------------------------------------------------===// // TestDialect //===----------------------------------------------------------------------===// diff --git a/mlir/test/mlir-tblgen/attrdefs.td b/mlir/test/mlir-tblgen/attrdefs.td index adec90dc5a371..d47411d6e860a 100644 --- a/mlir/test/mlir-tblgen/attrdefs.td +++ b/mlir/test/mlir-tblgen/attrdefs.td @@ -186,3 +186,16 @@ def I_TestGenMnemonicAliasAttr : TestAttr<"TestGenMnemonicAlias"> { // DEF-NEXT: os << "test_gen_mnemonic_alias"; // DEF-NEXT: return ::mlir::OpAsmAliasResult::OverridableAlias; // DEF-NEXT: } + +def J_CustomStorageCtorAttr : AttrDef { + let attrName = "test_custom_storage_ctor_attr"; + let parameters = (ins "bool":$flag); + let hasStorageCustomConstructor = 1; +} + +// Note: ';' at the end of construct method declaration is important - otherwise +// one cannot provide custom definition + +// DEF-LABEL: struct CustomStorageCtorAttrAttrStorage : public ::mlir::AttributeStorage +// DEF: static CustomStorageCtorAttrAttrStorage *construct +// DEF-SAME: (::mlir::AttributeStorageAllocator &allocator, KeyTy &&tblgenKey); diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index d9aa901ee2b28..dbae2143b920a 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -668,10 +668,10 @@ void DefGen::emitHashKey() { } void DefGen::emitConstruct() { - Method *construct = storageCls->addMethod( + Method *construct = storageCls->addMethod( strfmt("{0} *", def.getStorageClassName()), "construct", def.hasStorageCustomConstructor() ? 
Method::StaticDeclaration - : Method::Static, + : Method::StaticInline, MethodParameter(strfmt("::mlir::{0}StorageAllocator &", valueType), "allocator"), MethodParameter("KeyTy &&", "tblgenKey")); diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td index 7cb3016afd597..1e9537452820d 100644 --- a/offload/liboffload/API/Kernel.td +++ b/offload/liboffload/API/Kernel.td @@ -6,25 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file contains Offload API definitions related to loading and launching -// kernels +// This file contains Offload API definitions related to launching kernels // //===----------------------------------------------------------------------===// -def : Function { - let name = "olGetKernel"; - let desc = "Get a kernel from the function identified by `KernelName` in the given program."; - let details = [ - "Symbol handles are owned by the program and do not need to be manually destroyed." - ]; - let params = [ - Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>, - Param<"const char*", "KernelName", "name of the kernel entry point in the program", PARAM_IN>, - Param<"ol_symbol_handle_t*", "Kernel", "output pointer for the fetched kernel", PARAM_OUT> - ]; - let returns = []; -} - def : Struct { let name = "ol_kernel_launch_size_args_t"; let desc = "Size-related arguments for a kernel launch."; diff --git a/offload/liboffload/API/Symbol.td b/offload/liboffload/API/Symbol.td index cf4d45b09f035..2e94d703809e7 100644 --- a/offload/liboffload/API/Symbol.td +++ b/offload/liboffload/API/Symbol.td @@ -15,5 +15,77 @@ def : Enum { let desc = "The kind of a symbol"; let etors =[ Etor<"KERNEL", "a kernel object">, + Etor<"GLOBAL_VARIABLE", "a global variable">, + ]; +} + +def : Function { + let name = "olGetSymbol"; + let desc = "Get a symbol (kernel or global variable) identified by `Name` in the given program."; + let details = [ + "Symbol handles are 
owned by the program and do not need to be manually destroyed." + ]; + let params = [ + Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>, + Param<"const char*", "Name", "name of the symbol to look up", PARAM_IN>, + Param<"ol_symbol_kind_t", "Kind", "symbol kind to look up", PARAM_IN>, + Param<"ol_symbol_handle_t*", "Symbol", "output pointer for the symbol", PARAM_OUT>, + ]; + let returns = []; +} + +def : Enum { + let name = "ol_symbol_info_t"; + let desc = "Supported symbol info."; + let is_typed = 1; + let etors = [ + TaggedEtor<"KIND", "ol_symbol_kind_t", "The kind of this symbol.">, + TaggedEtor<"GLOBAL_VARIABLE_ADDRESS", "void *", "The address in memory for this global variable.">, + TaggedEtor<"GLOBAL_VARIABLE_SIZE", "size_t", "The size in bytes for this global variable.">, + ]; +} + +def : Function { + let name = "olGetSymbolInfo"; + let desc = "Queries the given property of the symbol."; + let details = [ + "`olGetSymbolInfoSize` can be used to query the storage size " + "required for the given query." + ]; + let params = [ + Param<"ol_symbol_handle_t", "Symbol", "handle of the symbol", PARAM_IN>, + Param<"ol_symbol_info_t", "PropName", "type of the info to retrieve", PARAM_IN>, + Param<"size_t", "PropSize", "the number of bytes pointed to by PropValue.", PARAM_IN>, + TypeTaggedParam<"void*", "PropValue", "array of bytes holding the info. " + "If PropSize is not equal to or greater to the real number of bytes needed to return the info " + "then the OL_ERRC_INVALID_SIZE error is returned and PropValue is not used.", PARAM_OUT, + TypeInfo<"PropName" , "PropSize">> + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", [ + "`PropSize == 0`", + "If `PropSize` is less than the real number of bytes needed to return the info." + ]>, + Return<"OL_ERRC_SYMBOL_KIND", [ + "If the requested info isn't applicable to the type of symbol." 
+ ]>, + Return<"OL_ERRC_INVALID_SYMBOL"> + ]; +} + +def : Function { + let name = "olGetSymbolInfoSize"; + let desc = "Returns the storage size of the given symbol query."; + let details = []; + let params = [ + Param<"ol_symbol_handle_t", "Symbol", "handle of the symbol", PARAM_IN>, + Param<"ol_symbol_info_t", "PropName", "type of the info to query", PARAM_IN>, + Param<"size_t*", "PropSizeRet", "pointer to the number of bytes required to store the query", PARAM_OUT> + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SYMBOL">, + Return<"OL_ERRC_SYMBOL_KIND", [ + "If the requested info isn't applicable to the type of symbol." + ]>, ]; } diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index fa5d18c044048..17a2b00cb7140 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -91,7 +91,9 @@ struct ol_program_impl_t { struct ol_symbol_impl_t { ol_symbol_impl_t(GenericKernelTy *Kernel) : PluginImpl(Kernel), Kind(OL_SYMBOL_KIND_KERNEL) {} - std::variant PluginImpl; + ol_symbol_impl_t(GlobalTy &&Global) + : PluginImpl(Global), Kind(OL_SYMBOL_KIND_GLOBAL_VARIABLE) {} + std::variant PluginImpl; ol_symbol_kind_t Kind; }; @@ -660,24 +662,6 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) { return olDestroy(Program); } -Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName, - ol_symbol_handle_t *Kernel) { - - auto &Device = Program->Image->getDevice(); - auto KernelImpl = Device.constructKernel(KernelName); - if (!KernelImpl) - return KernelImpl.takeError(); - - if (auto Err = KernelImpl->init(Device, *Program->Image)) - return Err; - - *Kernel = Program->Symbols - .emplace_back(std::make_unique(&*KernelImpl)) - .get(); - - return Error::success(); -} - Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, ol_symbol_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, @@ -726,5 +710,90 @@ Error 
olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, return Error::success(); } +Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name, + ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol) { + auto &Device = Program->Image->getDevice(); + + switch (Kind) { + case OL_SYMBOL_KIND_KERNEL: { + auto KernelImpl = Device.constructKernel(Name); + if (!KernelImpl) + return KernelImpl.takeError(); + + if (auto Err = KernelImpl->init(Device, *Program->Image)) + return Err; + + *Symbol = + Program->Symbols + .emplace_back(std::make_unique(&*KernelImpl)) + .get(); + return Error::success(); + } + case OL_SYMBOL_KIND_GLOBAL_VARIABLE: { + GlobalTy GlobalObj{Name}; + if (auto Res = Device.Plugin.getGlobalHandler().getGlobalMetadataFromDevice( + Device, *Program->Image, GlobalObj)) + return Res; + + *Symbol = Program->Symbols + .emplace_back( + std::make_unique(std::move(GlobalObj))) + .get(); + + return Error::success(); + } + default: + return createOffloadError(ErrorCode::INVALID_ENUMERATION, + "getSymbol kind enum '%i' is invalid", Kind); + } +} + +Error olGetSymbolInfoImplDetail(ol_symbol_handle_t Symbol, + ol_symbol_info_t PropName, size_t PropSize, + void *PropValue, size_t *PropSizeRet) { + InfoWriter Info(PropSize, PropValue, PropSizeRet); + + auto CheckKind = [&](ol_symbol_kind_t Required) { + if (Symbol->Kind != Required) { + std::string ErrBuffer; + llvm::raw_string_ostream(ErrBuffer) + << PropName << ": Expected a symbol of Kind " << Required + << " but given a symbol of Kind " << Symbol->Kind; + return Plugin::error(ErrorCode::SYMBOL_KIND, ErrBuffer.c_str()); + } + return Plugin::success(); + }; + + switch (PropName) { + case OL_SYMBOL_INFO_KIND: + return Info.write(Symbol->Kind); + case OL_SYMBOL_INFO_GLOBAL_VARIABLE_ADDRESS: + if (auto Err = CheckKind(OL_SYMBOL_KIND_GLOBAL_VARIABLE)) + return Err; + return Info.write(std::get(Symbol->PluginImpl).getPtr()); + case OL_SYMBOL_INFO_GLOBAL_VARIABLE_SIZE: + if (auto Err = 
CheckKind(OL_SYMBOL_KIND_GLOBAL_VARIABLE)) + return Err; + return Info.write(std::get(Symbol->PluginImpl).getSize()); + default: + return createOffloadError(ErrorCode::INVALID_ENUMERATION, + "olGetSymbolInfo enum '%i' is invalid", PropName); + } + + return Error::success(); +} + +Error olGetSymbolInfo_impl(ol_symbol_handle_t Symbol, ol_symbol_info_t PropName, + size_t PropSize, void *PropValue) { + + return olGetSymbolInfoImplDetail(Symbol, PropName, PropSize, PropValue, + nullptr); +} + +Error olGetSymbolInfoSize_impl(ol_symbol_handle_t Symbol, + ol_symbol_info_t PropName, size_t *PropSizeRet) { + return olGetSymbolInfoImplDetail(Symbol, PropName, 0, nullptr, PropSizeRet); +} + } // namespace offload } // namespace llvm diff --git a/offload/tools/offload-tblgen/PrintGen.cpp b/offload/tools/offload-tblgen/PrintGen.cpp index d1189688a90a3..89d7c820426cf 100644 --- a/offload/tools/offload-tblgen/PrintGen.cpp +++ b/offload/tools/offload-tblgen/PrintGen.cpp @@ -74,8 +74,12 @@ inline void printTagged(llvm::raw_ostream &os, const void *ptr, {0} value, size_ if (Type == "char[]") { OS << formatv(TAB_2 "printPtr(os, (const char*) ptr);\n"); } else { - OS << formatv(TAB_2 "const {0} * const tptr = (const {0} * const)ptr;\n", - Type); + if (Type == "void *") + OS << formatv(TAB_2 "void * const * const tptr = (void * " + "const * const)ptr;\n"); + else + OS << formatv( + TAB_2 "const {0} * const tptr = (const {0} * const)ptr;\n", Type); // TODO: Handle other cases here OS << TAB_2 "os << (const void *)tptr << \" (\";\n"; if (Type.ends_with("*")) { diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index 93e5fd2f6cd26..d76338612210d 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -19,7 +19,6 @@ add_offload_unittest("init" target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER) add_offload_unittest("kernel" - kernel/olGetKernel.cpp 
kernel/olLaunchKernel.cpp) add_offload_unittest("memory" @@ -41,3 +40,8 @@ add_offload_unittest("queue" queue/olDestroyQueue.cpp queue/olGetQueueInfo.cpp queue/olGetQueueInfoSize.cpp) + +add_offload_unittest("symbol" + symbol/olGetSymbol.cpp + symbol/olGetSymbolInfo.cpp + symbol/olGetSymbolInfoSize.cpp) diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp index e443d9761f30b..16ff3c4fe60a7 100644 --- a/offload/unittests/OffloadAPI/common/Fixtures.hpp +++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp @@ -91,9 +91,12 @@ struct OffloadPlatformTest : OffloadDeviceTest { // Fixture for a generic program test. If you want a different program, use // offloadQueueTest and create your own program handle with the binary you want. struct OffloadProgramTest : OffloadDeviceTest { - void SetUp() override { + void SetUp() override { SetUpWith("foo"); } + + void SetUpWith(const char *ProgramName) { RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp()); - ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin)); + ASSERT_TRUE( + TestEnvironment::loadDeviceBinary(ProgramName, Device, DeviceBin)); ASSERT_GE(DeviceBin->getBufferSize(), 0lu); ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(), DeviceBin->getBufferSize(), &Program)); @@ -113,7 +116,7 @@ struct OffloadProgramTest : OffloadDeviceTest { struct OffloadKernelTest : OffloadProgramTest { void SetUp() override { RETURN_ON_FATAL_FAILURE(OffloadProgramTest::SetUp()); - ASSERT_SUCCESS(olGetKernel(Program, "foo", &Kernel)); + ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &Kernel)); } void TearDown() override { @@ -123,6 +126,20 @@ struct OffloadKernelTest : OffloadProgramTest { ol_symbol_handle_t Kernel = nullptr; }; +struct OffloadGlobalTest : OffloadProgramTest { + void SetUp() override { + RETURN_ON_FATAL_FAILURE(OffloadProgramTest::SetUpWith("global")); + ASSERT_SUCCESS(olGetSymbol(Program, "global", + 
OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global)); + } + + void TearDown() override { + RETURN_ON_FATAL_FAILURE(OffloadProgramTest::TearDown()); + } + + ol_symbol_handle_t Global = nullptr; +}; + struct OffloadQueueTest : OffloadDeviceTest { void SetUp() override { RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp()); diff --git a/offload/unittests/OffloadAPI/device_code/global.c b/offload/unittests/OffloadAPI/device_code/global.c index b30e406fb98c7..9f27f9424324f 100644 --- a/offload/unittests/OffloadAPI/device_code/global.c +++ b/offload/unittests/OffloadAPI/device_code/global.c @@ -1,6 +1,7 @@ #include #include +[[gnu::visibility("default")]] uint32_t global[64]; __gpu_kernel void write() { diff --git a/offload/unittests/OffloadAPI/kernel/olGetKernel.cpp b/offload/unittests/OffloadAPI/kernel/olGetKernel.cpp deleted file mode 100644 index 34870f1fbf0a3..0000000000000 --- a/offload/unittests/OffloadAPI/kernel/olGetKernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//===------- Offload API tests - olGetKernel ---------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "../common/Fixtures.hpp" -#include -#include - -using olGetKernelTest = OffloadProgramTest; -OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetKernelTest); - -TEST_P(olGetKernelTest, Success) { - ol_symbol_handle_t Kernel = nullptr; - ASSERT_SUCCESS(olGetKernel(Program, "foo", &Kernel)); - ASSERT_NE(Kernel, nullptr); -} - -TEST_P(olGetKernelTest, InvalidNullProgram) { - ol_symbol_handle_t Kernel = nullptr; - ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, - olGetKernel(nullptr, "foo", &Kernel)); -} - -TEST_P(olGetKernelTest, InvalidNullKernelPointer) { - ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, - olGetKernel(Program, "foo", nullptr)); -} - -// Error code returning from plugin interface not yet supported -TEST_P(olGetKernelTest, InvalidKernelName) { - ol_symbol_handle_t Kernel = nullptr; - ASSERT_ERROR(OL_ERRC_NOT_FOUND, - olGetKernel(Program, "invalid_kernel_name", &Kernel)); -} diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp index acda4795edec2..e7e608f2a64d4 100644 --- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp +++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp @@ -40,7 +40,8 @@ struct LaunchKernelTestBase : OffloadQueueTest { struct LaunchSingleKernelTestBase : LaunchKernelTestBase { void SetUpKernel(const char *kernel) { RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel)); - ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel)); + ASSERT_SUCCESS( + olGetSymbol(Program, kernel, OL_SYMBOL_KIND_KERNEL, &Kernel)); } ol_symbol_handle_t Kernel = nullptr; @@ -67,7 +68,8 @@ struct LaunchMultipleKernelTestBase : LaunchKernelTestBase { Kernels.resize(kernels.size()); size_t I = 0; for (auto K : kernels) - ASSERT_SUCCESS(olGetKernel(Program, K, &Kernels[I++])); + ASSERT_SUCCESS( + olGetSymbol(Program, K, OL_SYMBOL_KIND_KERNEL, 
&Kernels[I++])); } std::vector Kernels; @@ -223,6 +225,15 @@ TEST_P(olLaunchKernelGlobalTest, Success) { ASSERT_SUCCESS(olMemFree(Mem)); } +TEST_P(olLaunchKernelGlobalTest, InvalidNotAKernel) { + ol_symbol_handle_t Global = nullptr; + ASSERT_SUCCESS( + olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global)); + ASSERT_ERROR( + OL_ERRC_SYMBOL_KIND, + olLaunchKernel(Queue, Device, Global, nullptr, 0, &LaunchArgs, nullptr)); +} + TEST_P(olLaunchKernelGlobalCtorTest, Success) { void *Mem; ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, diff --git a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp index c1762b451b81d..c1fb6df9bad0d 100644 --- a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp +++ b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp @@ -13,6 +13,32 @@ using olMemcpyTest = OffloadQueueTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemcpyTest); +struct olMemcpyGlobalTest : OffloadGlobalTest { + void SetUp() override { + RETURN_ON_FATAL_FAILURE(OffloadGlobalTest::SetUp()); + ASSERT_SUCCESS( + olGetSymbol(Program, "read", OL_SYMBOL_KIND_KERNEL, &ReadKernel)); + ASSERT_SUCCESS( + olGetSymbol(Program, "write", OL_SYMBOL_KIND_KERNEL, &WriteKernel)); + ASSERT_SUCCESS(olCreateQueue(Device, &Queue)); + ASSERT_SUCCESS(olGetSymbolInfo( + Global, OL_SYMBOL_INFO_GLOBAL_VARIABLE_ADDRESS, sizeof(Addr), &Addr)); + + LaunchArgs.Dimensions = 1; + LaunchArgs.GroupSize = {64, 1, 1}; + LaunchArgs.NumGroups = {1, 1, 1}; + + LaunchArgs.DynSharedMemory = 0; + } + + ol_kernel_launch_size_args_t LaunchArgs{}; + void *Addr; + ol_symbol_handle_t ReadKernel; + ol_symbol_handle_t WriteKernel; + ol_queue_handle_t Queue; +}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemcpyGlobalTest); + TEST_P(olMemcpyTest, SuccessHtoD) { constexpr size_t Size = 1024; void *Alloc; @@ -105,3 +131,82 @@ TEST_P(olMemcpyTest, SuccessSizeZero) { ASSERT_SUCCESS( olMemcpy(nullptr, Output.data(), Host, Input.data(), Host, 0, 
nullptr)); } + +TEST_P(olMemcpyGlobalTest, SuccessRoundTrip) { + void *SourceMem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + 64 * sizeof(uint32_t), &SourceMem)); + uint32_t *SourceData = (uint32_t *)SourceMem; + for (auto I = 0; I < 64; I++) + SourceData[I] = I; + + void *DestMem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + 64 * sizeof(uint32_t), &DestMem)); + + ASSERT_SUCCESS(olMemcpy(Queue, Addr, Device, SourceMem, Host, + 64 * sizeof(uint32_t), nullptr)); + ASSERT_SUCCESS(olWaitQueue(Queue)); + ASSERT_SUCCESS(olMemcpy(Queue, DestMem, Host, Addr, Device, + 64 * sizeof(uint32_t), nullptr)); + ASSERT_SUCCESS(olWaitQueue(Queue)); + + uint32_t *DestData = (uint32_t *)DestMem; + for (uint32_t I = 0; I < 64; I++) + ASSERT_EQ(DestData[I], I); + + ASSERT_SUCCESS(olMemFree(DestMem)); + ASSERT_SUCCESS(olMemFree(SourceMem)); +} + +TEST_P(olMemcpyGlobalTest, SuccessWrite) { + void *SourceMem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + LaunchArgs.GroupSize.x * sizeof(uint32_t), + &SourceMem)); + uint32_t *SourceData = (uint32_t *)SourceMem; + for (auto I = 0; I < 64; I++) + SourceData[I] = I; + + void *DestMem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + LaunchArgs.GroupSize.x * sizeof(uint32_t), + &DestMem)); + struct { + void *Mem; + } Args{DestMem}; + + ASSERT_SUCCESS(olMemcpy(Queue, Addr, Device, SourceMem, Host, + 64 * sizeof(uint32_t), nullptr)); + ASSERT_SUCCESS(olWaitQueue(Queue)); + ASSERT_SUCCESS(olLaunchKernel(Queue, Device, ReadKernel, &Args, sizeof(Args), + &LaunchArgs, nullptr)); + ASSERT_SUCCESS(olWaitQueue(Queue)); + + uint32_t *DestData = (uint32_t *)DestMem; + for (uint32_t I = 0; I < 64; I++) + ASSERT_EQ(DestData[I], I); + + ASSERT_SUCCESS(olMemFree(DestMem)); + ASSERT_SUCCESS(olMemFree(SourceMem)); +} + +TEST_P(olMemcpyGlobalTest, SuccessRead) { + void *DestMem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + LaunchArgs.GroupSize.x * sizeof(uint32_t), + &DestMem)); + + 
ASSERT_SUCCESS(olLaunchKernel(Queue, Device, WriteKernel, nullptr, 0, + &LaunchArgs, nullptr)); + ASSERT_SUCCESS(olWaitQueue(Queue)); + ASSERT_SUCCESS(olMemcpy(Queue, DestMem, Host, Addr, Device, + 64 * sizeof(uint32_t), nullptr)); + ASSERT_SUCCESS(olWaitQueue(Queue)); + + uint32_t *DestData = (uint32_t *)DestMem; + for (uint32_t I = 0; I < 64; I++) + ASSERT_EQ(DestData[I], I * 2); + + ASSERT_SUCCESS(olMemFree(DestMem)); +} diff --git a/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp b/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp new file mode 100644 index 0000000000000..5e87ab5b29621 --- /dev/null +++ b/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp @@ -0,0 +1,93 @@ +//===------- Offload API tests - olGetSymbol ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include +#include + +using olGetSymbolKernelTest = OffloadProgramTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetSymbolKernelTest); + +struct olGetSymbolGlobalTest : OffloadQueueTest { + void SetUp() override { + RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp()); + ASSERT_TRUE(TestEnvironment::loadDeviceBinary("global", Device, DeviceBin)); + ASSERT_GE(DeviceBin->getBufferSize(), 0lu); + ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(), + DeviceBin->getBufferSize(), &Program)); + } + + void TearDown() override { + if (Program) { + olDestroyProgram(Program); + } + RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown()); + } + + std::unique_ptr DeviceBin; + ol_program_handle_t Program = nullptr; + ol_kernel_launch_size_args_t LaunchArgs{}; +}; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetSymbolGlobalTest); + +TEST_P(olGetSymbolKernelTest, Success) { + 
ol_symbol_handle_t Kernel = nullptr; + ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &Kernel)); + ASSERT_NE(Kernel, nullptr); +} + +TEST_P(olGetSymbolKernelTest, InvalidNullProgram) { + ol_symbol_handle_t Kernel = nullptr; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olGetSymbol(nullptr, "foo", OL_SYMBOL_KIND_KERNEL, &Kernel)); +} + +TEST_P(olGetSymbolKernelTest, InvalidNullKernelPointer) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, nullptr)); +} + +TEST_P(olGetSymbolKernelTest, InvalidKernelName) { + ol_symbol_handle_t Kernel = nullptr; + ASSERT_ERROR(OL_ERRC_NOT_FOUND, olGetSymbol(Program, "invalid_kernel_name", + OL_SYMBOL_KIND_KERNEL, &Kernel)); +} + +TEST_P(olGetSymbolKernelTest, InvalidKind) { + ol_symbol_handle_t Kernel = nullptr; + ASSERT_ERROR( + OL_ERRC_INVALID_ENUMERATION, + olGetSymbol(Program, "foo", OL_SYMBOL_KIND_FORCE_UINT32, &Kernel)); +} + +TEST_P(olGetSymbolGlobalTest, Success) { + ol_symbol_handle_t Global = nullptr; + ASSERT_SUCCESS( + olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global)); + ASSERT_NE(Global, nullptr); +} + +TEST_P(olGetSymbolGlobalTest, InvalidNullProgram) { + ol_symbol_handle_t Global = nullptr; + ASSERT_ERROR( + OL_ERRC_INVALID_NULL_HANDLE, + olGetSymbol(nullptr, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global)); +} + +TEST_P(olGetSymbolGlobalTest, InvalidNullGlobalPointer) { + ASSERT_ERROR( + OL_ERRC_INVALID_NULL_POINTER, + olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, nullptr)); +} + +TEST_P(olGetSymbolGlobalTest, InvalidGlobalName) { + ol_symbol_handle_t Global = nullptr; + ASSERT_ERROR(OL_ERRC_NOT_FOUND, + olGetSymbol(Program, "invalid_global", + OL_SYMBOL_KIND_GLOBAL_VARIABLE, &Global)); +} diff --git a/offload/unittests/OffloadAPI/symbol/olGetSymbolInfo.cpp b/offload/unittests/OffloadAPI/symbol/olGetSymbolInfo.cpp new file mode 100644 index 0000000000000..ed8f4716974cd --- /dev/null +++ 
b/offload/unittests/OffloadAPI/symbol/olGetSymbolInfo.cpp @@ -0,0 +1,93 @@ +//===------- Offload API tests - olGetSymbolInfo --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "../common/Fixtures.hpp" + +using olGetSymbolInfoKernelTest = OffloadKernelTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetSymbolInfoKernelTest); + +using olGetSymbolInfoGlobalTest = OffloadGlobalTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetSymbolInfoGlobalTest); + +TEST_P(olGetSymbolInfoKernelTest, SuccessKind) { + ol_symbol_kind_t RetrievedKind; + ASSERT_SUCCESS(olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_KIND, + sizeof(RetrievedKind), &RetrievedKind)); + ASSERT_EQ(RetrievedKind, OL_SYMBOL_KIND_KERNEL); +} + +TEST_P(olGetSymbolInfoGlobalTest, SuccessKind) { + ol_symbol_kind_t RetrievedKind; + ASSERT_SUCCESS(olGetSymbolInfo(Global, OL_SYMBOL_INFO_KIND, + sizeof(RetrievedKind), &RetrievedKind)); + ASSERT_EQ(RetrievedKind, OL_SYMBOL_KIND_GLOBAL_VARIABLE); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidAddress) { + void *RetrievedAddr; + ASSERT_ERROR(OL_ERRC_SYMBOL_KIND, + olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_GLOBAL_VARIABLE_ADDRESS, + sizeof(RetrievedAddr), &RetrievedAddr)); +} + +TEST_P(olGetSymbolInfoGlobalTest, SuccessAddress) { + void *RetrievedAddr = nullptr; + ASSERT_SUCCESS(olGetSymbolInfo(Global, OL_SYMBOL_INFO_GLOBAL_VARIABLE_ADDRESS, + sizeof(RetrievedAddr), &RetrievedAddr)); + ASSERT_NE(RetrievedAddr, nullptr); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidSize) { + size_t RetrievedSize; + ASSERT_ERROR(OL_ERRC_SYMBOL_KIND, + olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_GLOBAL_VARIABLE_SIZE, + sizeof(RetrievedSize), &RetrievedSize)); +} + +TEST_P(olGetSymbolInfoGlobalTest, SuccessSize) 
{ + size_t RetrievedSize = 0; + ASSERT_SUCCESS(olGetSymbolInfo(Global, OL_SYMBOL_INFO_GLOBAL_VARIABLE_SIZE, + sizeof(RetrievedSize), &RetrievedSize)); + ASSERT_EQ(RetrievedSize, 64 * sizeof(uint32_t)); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidNullHandle) { + ol_symbol_kind_t RetrievedKind; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olGetSymbolInfo(nullptr, OL_SYMBOL_INFO_KIND, + sizeof(RetrievedKind), &RetrievedKind)); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidSymbolInfoEnumeration) { + ol_symbol_kind_t RetrievedKind; + ASSERT_ERROR(OL_ERRC_INVALID_ENUMERATION, + olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_FORCE_UINT32, + sizeof(RetrievedKind), &RetrievedKind)); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidSizeZero) { + ol_symbol_kind_t RetrievedKind; + ASSERT_ERROR(OL_ERRC_INVALID_SIZE, + olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_KIND, 0, &RetrievedKind)); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidSizeSmall) { + ol_symbol_kind_t RetrievedKind; + ASSERT_ERROR(OL_ERRC_INVALID_SIZE, + olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_KIND, + sizeof(RetrievedKind) - 1, &RetrievedKind)); +} + +TEST_P(olGetSymbolInfoKernelTest, InvalidNullPointerPropValue) { + ol_symbol_kind_t RetrievedKind; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olGetSymbolInfo(Kernel, OL_SYMBOL_INFO_KIND, + sizeof(RetrievedKind), nullptr)); +} diff --git a/offload/unittests/OffloadAPI/symbol/olGetSymbolInfoSize.cpp b/offload/unittests/OffloadAPI/symbol/olGetSymbolInfoSize.cpp new file mode 100644 index 0000000000000..ec011865cc6ad --- /dev/null +++ b/offload/unittests/OffloadAPI/symbol/olGetSymbolInfoSize.cpp @@ -0,0 +1,60 @@ +//===------- Offload API tests - olGetSymbolInfoSize ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "../common/Fixtures.hpp" + +using olGetSymbolInfoSizeKernelTest = OffloadKernelTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetSymbolInfoSizeKernelTest); + +using olGetSymbolInfoSizeGlobalTest = OffloadGlobalTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetSymbolInfoSizeGlobalTest); + +TEST_P(olGetSymbolInfoSizeKernelTest, SuccessKind) { + size_t Size = 0; + ASSERT_SUCCESS(olGetSymbolInfoSize(Kernel, OL_SYMBOL_INFO_KIND, &Size)); + ASSERT_EQ(Size, sizeof(ol_symbol_kind_t)); +} + +TEST_P(olGetSymbolInfoSizeGlobalTest, SuccessKind) { + size_t Size = 0; + ASSERT_SUCCESS(olGetSymbolInfoSize(Global, OL_SYMBOL_INFO_KIND, &Size)); + ASSERT_EQ(Size, sizeof(ol_symbol_kind_t)); +} + +TEST_P(olGetSymbolInfoSizeGlobalTest, SuccessAddress) { + size_t Size = 0; + ASSERT_SUCCESS(olGetSymbolInfoSize( + Global, OL_SYMBOL_INFO_GLOBAL_VARIABLE_ADDRESS, &Size)); + ASSERT_EQ(Size, sizeof(void *)); +} + +TEST_P(olGetSymbolInfoSizeGlobalTest, SuccessSize) { + size_t Size = 0; + ASSERT_SUCCESS( + olGetSymbolInfoSize(Global, OL_SYMBOL_INFO_GLOBAL_VARIABLE_SIZE, &Size)); + ASSERT_EQ(Size, sizeof(size_t)); +} + +TEST_P(olGetSymbolInfoSizeKernelTest, InvalidNullHandle) { + size_t Size = 0; + ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, + olGetSymbolInfoSize(nullptr, OL_SYMBOL_INFO_KIND, &Size)); +} + +TEST_P(olGetSymbolInfoSizeKernelTest, InvalidSymbolInfoEnumeration) { + size_t Size = 0; + ASSERT_ERROR(OL_ERRC_INVALID_ENUMERATION, + olGetSymbolInfoSize(Kernel, OL_SYMBOL_INFO_FORCE_UINT32, &Size)); +} + +TEST_P(olGetSymbolInfoSizeKernelTest, InvalidNullPointer) { + ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, + olGetSymbolInfoSize(Kernel, OL_SYMBOL_INFO_KIND, nullptr)); +} diff --git a/third-party/siphash/include/siphash/SipHash.h b/third-party/siphash/include/siphash/SipHash.h index 9653e9428b123..ca4fe45e4fddf 100644 
--- a/third-party/siphash/include/siphash/SipHash.h +++ b/third-party/siphash/include/siphash/SipHash.h @@ -104,25 +104,24 @@ void siphash(const unsigned char *in, uint64_t inlen, switch (left) { case 7: b |= ((uint64_t)ni[6]) << 48; - /* FALLTHRU */ + [[fallthrough]]; case 6: b |= ((uint64_t)ni[5]) << 40; - /* FALLTHRU */ + [[fallthrough]]; case 5: b |= ((uint64_t)ni[4]) << 32; - /* FALLTHRU */ + [[fallthrough]]; case 4: b |= ((uint64_t)ni[3]) << 24; - /* FALLTHRU */ + [[fallthrough]]; case 3: b |= ((uint64_t)ni[2]) << 16; - /* FALLTHRU */ + [[fallthrough]]; case 2: b |= ((uint64_t)ni[1]) << 8; - /* FALLTHRU */ + [[fallthrough]]; case 1: b |= ((uint64_t)ni[0]); - /* FALLTHRU */ break; case 0: break; diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index a80d8d430ec5c..d259f391069a4 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2083,11 +2083,10 @@ libc_support_library( name = "expxf16", hdrs = ["src/math/generic/expxf16.h"], deps = [ - ":__support_cpp_array", ":__support_fputil_cast", ":__support_fputil_fp_bits", ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", + ":__support_math_expf16_utils", ], ) @@ -2117,6 +2116,39 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_expf16_utils", + hdrs = ["src/__support/math/expf16_utils.h"], + deps = [ + ":__support_cpp_array", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_macros_attributes", + ":llvm_libc_macros_float16_macros" + ], +) + +libc_support_library( + name = "__support_math_expf16", + hdrs = ["src/__support/math/expf16.h"], + deps = [ + ":__support_common", + ":__support_libc_errno", + ":__support_cpp_array", + ":__support_fputil_cast", + ":__support_fputil_except_value_utils", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + 
":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_macros_optimization", + ":__support_math_expf16_utils", + ":llvm_libc_macros_float16_macros" + ], +) + ############################### complex targets ################################ libc_function( @@ -2722,6 +2754,7 @@ libc_math_function( libc_math_function( name = "expf16", additional_deps = [ + ":__support_math_expf16", ":expxf16", ], ) diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl index ba9db05c651a7..2309175d04a9b 100644 --- a/utils/bazel/llvm-project-overlay/llvm/config.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl @@ -112,6 +112,8 @@ llvm_config_defines = os_defines + builtin_thread_pointer + select({ "LLVM_VERSION_MINOR={}".format(LLVM_VERSION_MINOR), "LLVM_VERSION_PATCH={}".format(LLVM_VERSION_PATCH), r'LLVM_VERSION_STRING=\"{}\"'.format(PACKAGE_VERSION), + # Set globally in HandleLLVMOptions.cmake + "EXPERIMENTAL_KEY_INSTRUCTIONS", # These shouldn't be needed by the C++11 standard, but are for some # platforms (e.g. glibc < 2.18. See # https://sourceware.org/bugzilla/show_bug.cgi?id=15366). 
These are also diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 046ff102cda59..37c865ca4c4ca 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5435,6 +5435,7 @@ cc_library( ":Transforms", ":VCIXToLLVMIRTranslation", ":VectorDialect", + ":XeVMDialect", ":config", "//llvm:Core", "//llvm:MC", diff --git a/utils/bazel/llvm_configs/abi-breaking.h.cmake b/utils/bazel/llvm_configs/abi-breaking.h.cmake index 318bd015f80d2..2d27e02b1d545 100644 --- a/utils/bazel/llvm_configs/abi-breaking.h.cmake +++ b/utils/bazel/llvm_configs/abi-breaking.h.cmake @@ -12,9 +12,6 @@ #ifndef LLVM_ABI_BREAKING_CHECKS_H #define LLVM_ABI_BREAKING_CHECKS_H -// Compiler.h is required for LLVM_ABI definition. -#include "llvm/Support/Compiler.h" - /* Define to enable checks that alter the LLVM C++ ABI */ #cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -46,12 +43,12 @@ #endif namespace llvm { #if LLVM_ENABLE_ABI_BREAKING_CHECKS -LLVM_ABI extern int EnableABIBreakingChecks; +extern int EnableABIBreakingChecks; LLVM_HIDDEN_VISIBILITY __attribute__((weak)) int *VerifyEnableABIBreakingChecks = &EnableABIBreakingChecks; #else -LLVM_ABI extern int DisableABIBreakingChecks; +extern int DisableABIBreakingChecks; LLVM_HIDDEN_VISIBILITY __attribute__((weak)) int *VerifyDisableABIBreakingChecks = &DisableABIBreakingChecks;